1 /* 2 * Copyright (C) 2010-2018 Arm Limited or its affiliates. All rights reserved. 3 * Copyright (c) 2019 Nuclei Limited. All rights reserved. 4 * 5 * SPDX-License-Identifier: Apache-2.0 6 * 7 * Licensed under the Apache License, Version 2.0 (the License); you may 8 * not use this file except in compliance with the License. 9 * You may obtain a copy of the License at 10 * 11 * www.apache.org/licenses/LICENSE-2.0 12 * 13 * Unless required by applicable law or agreed to in writing, software 14 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 15 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 * See the License for the specific language governing permissions and 17 * limitations under the License. 18 */ 19 20 /* ---------------------------------------------------------------------- 21 * Project: NMSIS NN Library 22 * Title: riscv_nnfunctions.h 23 * Description: Public header file for NMSIS NN Library 24 * 25 * $Date: 13. July 2018 26 * $Revision: V.1.0.0 27 * 28 * Target Processor: RISC-V Cores 29 * -------------------------------------------------------------------- */ 30 31 /** 32 \mainpage NMSIS NN Software Library 33 * 34 * Introduction 35 * ------------ 36 * 37 * This user manual describes the NMSIS NN software library, 38 * a collection of efficient neural network kernels developed to maximize the 39 * performance and minimize the memory footprint of neural networks on Nuclei N processor cores. 40 * 41 * The library is divided into a number of functions each covering a specific category: 42 * - Neural Network Convolution Functions 43 * - Neural Network Activation Functions 44 * - Fully-connected Layer Functions 45 * - Neural Network Pooling Functions 46 * - Softmax Functions 47 * - Neural Network Support Functions 48 * 49 * The library has separate functions for operating on different weight and activation data 50 * types including 8-bit integers (q7_t) and 16-bit integers (q15_t). 
The descrition of the 51 * kernels are included in the function description. The implementation details are also 52 * described in this paper [1]. 53 * 54 * \note Please refer to [NMSIS-NN](../../../nn/index.html) 55 * 56 * Block Diagram 57 * -------- 58 * \image html NMSIS-NN-OVERVIEW.PNG 59 * 60 * Examples 61 * -------- 62 * 63 * The library ships with a number of examples which demonstrate how to use the library functions. 64 * 65 * Pre-processor Macros 66 * ------------ 67 * 68 * Each library project have differant pre-processor macros. 69 * 70 * - RISCV_MATH_DSP: 71 * 72 * Define macro RISCV_MATH_DSP, If the silicon supports DSP instructions. 73 * 74 * - RISCV_NN_TRUNCATE: 75 * 76 * Define macro RISCV_NN_TRUNCATE to use floor instead of round-to-the-nearest-int for the computation. 77 * 78 * 79 * [1] CMSIS-NN: Efficient Neural Network Kernels for Arm Cortex-M CPUs https://arxiv.org/abs/1801.06601 80 */ 81 82 /** 83 * @defgroup groupNN Neural Network Functions 84 * These functions perform basic operations for neural network layers. 85 */ 86 87 #ifndef _RISCV_NNFUNCTIONS_H 88 #define _RISCV_NNFUNCTIONS_H 89 90 #include "riscv_nnsupportfunctions.h" 91 #include "riscv_nn_tables.h" 92 93 #define USE_INTRINSIC 94 95 //#define RISCV_NN_TRUNCATE /* This config the rounding model to floor or round to the nearest int */ 96 97 #ifdef __cplusplus 98 extern "C" 99 { 100 #endif 101 102 /** 103 * @defgroup NNConv Neural Network Convolution Functions 104 * 105 * Perform convolution layer 106 * 107 * The convolution is implemented in 2 steps: im2col and GEMM 108 * 109 * im2col is a process of converting each patch of image data into 110 * a column. After im2col, the convolution is computed as matrix-matrix 111 * multiplication. 112 * 113 * To reduce the memory footprint, the im2col is performed partially. 114 * Each iteration, only a few column (i.e., patches) are generated and 115 * computed with GEMM kernels similar to NMSIS-DSP riscv_mat_mult functions. 
116 * 117 */ 118 119 /** 120 * @brief Basic Q7 convolution function 121 * @param[in] Im_in pointer to input tensor 122 * @param[in] dim_im_in input tensor dimention 123 * @param[in] ch_im_in number of input tensor channels 124 * @param[in] wt pointer to kernel weights 125 * @param[in] ch_im_out number of filters, i.e., output tensor channels 126 * @param[in] dim_kernel filter kernel size 127 * @param[in] padding padding sizes 128 * @param[in] stride convolution stride 129 * @param[in] bias pointer to bias 130 * @param[in] bias_shift amount of left-shift for bias 131 * @param[in] out_shift amount of right-shift for output 132 * @param[in,out] Im_out pointer to output tensor 133 * @param[in] dim_im_out output tensor dimension 134 * @param[in,out] bufferA pointer to buffer space for input 135 * @param[in,out] bufferB pointer to buffer space for output 136 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 137 * 138 */ 139 140 riscv_status riscv_convolve_HWC_q7_basic(const q7_t * Im_in, 141 const uint16_t dim_im_in, 142 const uint16_t ch_im_in, 143 const q7_t * wt, 144 const uint16_t ch_im_out, 145 const uint16_t dim_kernel, 146 const uint16_t padding, 147 const uint16_t stride, 148 const q7_t * bias, 149 const uint16_t bias_shift, 150 const uint16_t out_shift, 151 q7_t * Im_out, 152 const uint16_t dim_im_out, 153 q15_t * bufferA, 154 q7_t * bufferB); 155 156 /** 157 * @brief Basic Q7 convolution function (non-sqaure shape) 158 * @param[in] Im_in pointer to input tensor 159 * @param[in] dim_im_in_x input tensor dimention x 160 * @param[in] dim_im_in_y input tensor dimention y 161 * @param[in] ch_im_in number of input tensor channels 162 * @param[in] wt pointer to kernel weights 163 * @param[in] ch_im_out number of filters, i.e., output tensor channels 164 * @param[in] dim_kernel_x filter kernel size x 165 * @param[in] dim_kernel_y filter kernel size y 166 * @param[in] padding_x padding size x 167 * @param[in] padding_y padding size y 168 * @param[in] 
stride_x convolution stride x 169 * @param[in] stride_y convolution stride y 170 * @param[in] bias pointer to bias 171 * @param[in] bias_shift amount of left-shift for bias 172 * @param[in] out_shift amount of right-shift for output 173 * @param[in,out] Im_out pointer to output tensor 174 * @param[in] dim_im_out_x output tensor dimension x 175 * @param[in] dim_im_out_y output tensor dimension y 176 * @param[in,out] bufferA pointer to buffer space for input 177 * @param[in,out] bufferB pointer to buffer space for output 178 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 179 */ 180 181 riscv_status riscv_convolve_HWC_q7_basic_nonsquare(const q7_t * Im_in, 182 const uint16_t dim_im_in_x, 183 const uint16_t dim_im_in_y, 184 const uint16_t ch_im_in, 185 const q7_t * wt, 186 const uint16_t ch_im_out, 187 const uint16_t dim_kernel_x, 188 const uint16_t dim_kernel_y, 189 const uint16_t padding_x, 190 const uint16_t padding_y, 191 const uint16_t stride_x, 192 const uint16_t stride_y, 193 const q7_t * bias, 194 const uint16_t bias_shift, 195 const uint16_t out_shift, 196 q7_t * Im_out, 197 const uint16_t dim_im_out_x, 198 const uint16_t dim_im_out_y, 199 q15_t * bufferA, 200 q7_t * bufferB); 201 202 /** 203 * @brief Basic Q15 convolution function 204 * @param[in] Im_in pointer to input tensor 205 * @param[in] dim_im_in input tensor dimention 206 * @param[in] ch_im_in number of input tensor channels 207 * @param[in] wt pointer to kernel weights 208 * @param[in] ch_im_out number of filters, i.e., output tensor channels 209 * @param[in] dim_kernel filter kernel size 210 * @param[in] padding padding sizes 211 * @param[in] stride convolution stride 212 * @param[in] bias pointer to bias 213 * @param[in] bias_shift amount of left-shift for bias 214 * @param[in] out_shift amount of right-shift for output 215 * @param[in,out] Im_out pointer to output tensor 216 * @param[in] dim_im_out output tensor dimension 217 * @param[in,out] bufferA pointer to buffer space for 
input 218 * @param[in,out] bufferB pointer to buffer space for output 219 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 220 * 221 */ 222 223 riscv_status riscv_convolve_HWC_q15_basic(const q15_t * Im_in, 224 const uint16_t dim_im_in, 225 const uint16_t ch_im_in, 226 const q15_t * wt, 227 const uint16_t ch_im_out, 228 const uint16_t dim_kernel, 229 const uint16_t padding, 230 const uint16_t stride, 231 const q15_t * bias, 232 const uint16_t bias_shift, 233 const uint16_t out_shift, 234 q15_t * Im_out, 235 const uint16_t dim_im_out, 236 q15_t * bufferA, 237 q7_t * bufferB); 238 239 /** 240 * @brief Fast Q7 convolution function 241 * @param[in] Im_in pointer to input tensor 242 * @param[in] dim_im_in input tensor dimention 243 * @param[in] ch_im_in number of input tensor channels 244 * @param[in] wt pointer to kernel weights 245 * @param[in] ch_im_out number of filters, i.e., output tensor channels 246 * @param[in] dim_kernel filter kernel size 247 * @param[in] padding padding sizes 248 * @param[in] stride convolution stride 249 * @param[in] bias pointer to bias 250 * @param[in] bias_shift amount of left-shift for bias 251 * @param[in] out_shift amount of right-shift for output 252 * @param[in,out] Im_out pointer to output tensor 253 * @param[in] dim_im_out output tensor dimension 254 * @param[in,out] bufferA pointer to buffer space for input 255 * @param[in,out] bufferB pointer to buffer space for output 256 * @return The function returns either 257 * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 
258 * 259 * This function is the version with full list of optimization tricks, but with 260 * some contraints: 261 * ch_im_in is multiple of 4 262 * ch_im_out is multiple of 2 263 */ 264 265 riscv_status riscv_convolve_HWC_q7_fast(const q7_t * Im_in, 266 const uint16_t dim_im_in, 267 const uint16_t ch_im_in, 268 const q7_t * wt, 269 const uint16_t ch_im_out, 270 const uint16_t dim_kernel, 271 const uint16_t padding, 272 const uint16_t stride, 273 const q7_t * bias, 274 const uint16_t bias_shift, 275 const uint16_t out_shift, 276 q7_t * Im_out, 277 const uint16_t dim_im_out, 278 q15_t * bufferA, 279 q7_t * bufferB); 280 281 /** 282 * @brief Fast Q7 convolution function (non-sqaure shape) 283 * @param[in] Im_in pointer to input tensor 284 * @param[in] dim_im_in_x input tensor dimention x 285 * @param[in] dim_im_in_y input tensor dimention y 286 * @param[in] ch_im_in number of input tensor channels 287 * @param[in] wt pointer to kernel weights 288 * @param[in] ch_im_out number of filters, i.e., output tensor channels 289 * @param[in] dim_kernel_x filter kernel size x 290 * @param[in] dim_kernel_y filter kernel size y 291 * @param[in] padding_x padding size x 292 * @param[in] padding_y padding size y 293 * @param[in] stride_x convolution stride x 294 * @param[in] stride_y convolution stride y 295 * @param[in] bias pointer to bias 296 * @param[in] bias_shift amount of left-shift for bias 297 * @param[in] out_shift amount of right-shift for output 298 * @param[in,out] Im_out pointer to output tensor 299 * @param[in] dim_im_out_x output tensor dimension x 300 * @param[in] dim_im_out_y output tensor dimension y 301 * @param[in,out] bufferA pointer to buffer space for input 302 * @param[in,out] bufferB pointer to buffer space for output 303 * @return The function returns either 304 * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 
305 * 306 * This function is the version with full list of optimization tricks, but with 307 * some contraints: 308 * ch_im_in is multiple of 4 309 * ch_im_out is multiple of 2 310 */ 311 312 riscv_status riscv_convolve_HWC_q7_fast_nonsquare(const q7_t * Im_in, 313 const uint16_t dim_im_in_x, 314 const uint16_t dim_im_in_y, 315 const uint16_t ch_im_in, 316 const q7_t * wt, 317 const uint16_t ch_im_out, 318 const uint16_t dim_kernel_x, 319 const uint16_t dim_kernel_y, 320 const uint16_t padding_x, 321 const uint16_t padding_y, 322 const uint16_t stride_x, 323 const uint16_t stride_y, 324 const q7_t * bias, 325 const uint16_t bias_shift, 326 const uint16_t out_shift, 327 q7_t * Im_out, 328 const uint16_t dim_im_out_x, 329 const uint16_t dim_im_out_y, 330 q15_t * bufferA, 331 q7_t * bufferB); 332 333 /** 334 * @brief Fast Q7 version of 1x1 convolution (non-sqaure shape) 335 * @param[in] Im_in pointer to input tensor 336 * @param[in] dim_im_in_x input tensor dimention x 337 * @param[in] dim_im_in_y input tensor dimention y 338 * @param[in] ch_im_in number of input tensor channels 339 * @param[in] wt pointer to kernel weights 340 * @param[in] ch_im_out number of filters, i.e., output tensor channels 341 * @param[in] dim_kernel_x filter kernel size x 342 * @param[in] dim_kernel_y filter kernel size y 343 * @param[in] padding_x padding size x 344 * @param[in] padding_y padding size y 345 * @param[in] stride_x convolution stride x 346 * @param[in] stride_y convolution stride y 347 * @param[in] bias pointer to bias 348 * @param[in] bias_shift amount of left-shift for bias 349 * @param[in] out_shift amount of right-shift for output 350 * @param[in,out] Im_out pointer to output tensor 351 * @param[in] dim_im_out_x output tensor dimension x 352 * @param[in] dim_im_out_y output tensor dimension y 353 * @param[in,out] bufferA pointer to buffer space for input 354 * @param[in,out] bufferB pointer to buffer space for output 355 * @return The function returns either 356 * 
<code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 357 * 358 * This function implement convolution with 1x1 kernel size (i.e., dim_kernel_x=1 359 * and dim_kernel_y=1). It can be used for 360 * second half of MobileNets after depthwise separable convolution. 361 * 362 * This function is the version with full list of optimization tricks, but with 363 * some contraints: 364 * ch_im_in is multiple of 4 365 * ch_im_out is multiple of 2 366 */ 367 riscv_status riscv_convolve_1x1_HWC_q7_fast_nonsquare(const q7_t * Im_in, 368 const uint16_t dim_im_in_x, 369 const uint16_t dim_im_in_y, 370 const uint16_t ch_im_in, 371 const q7_t * wt, 372 const uint16_t ch_im_out, 373 const uint16_t dim_kernel_x, 374 const uint16_t dim_kernel_y, 375 const uint16_t padding_x, 376 const uint16_t padding_y, 377 const uint16_t stride_x, 378 const uint16_t stride_y, 379 const q7_t * bias, 380 const uint16_t bias_shift, 381 const uint16_t out_shift, 382 q7_t * Im_out, 383 const uint16_t dim_im_out_x, 384 const uint16_t dim_im_out_y, 385 q15_t * bufferA, 386 q7_t * bufferB); 387 388 /** 389 * @brief Q7 version of convolution for RGB image 390 * @param[in] Im_in pointer to input tensor 391 * @param[in] dim_im_in input tensor dimention 392 * @param[in] ch_im_in number of input tensor channels 393 * @param[in] wt pointer to kernel weights 394 * @param[in] ch_im_out number of filters, i.e., output tensor channels 395 * @param[in] dim_kernel filter kernel size 396 * @param[in] padding padding sizes 397 * @param[in] stride convolution stride 398 * @param[in] bias pointer to bias 399 * @param[in] bias_shift amount of left-shift for bias 400 * @param[in] out_shift amount of right-shift for output 401 * @param[in,out] Im_out pointer to output tensor 402 * @param[in] dim_im_out output tensor dimension 403 * @param[in,out] bufferA pointer to buffer space for input 404 * @param[in,out] bufferB pointer to buffer space for output 405 * @return The 
function returns either 406 * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 407 * 408 * This kernel is written exclusively for convolution with ch_im_in 409 * equals 3. This applies on the first layer of CNNs which has input 410 * image with RGB format. 411 */ 412 413 riscv_status riscv_convolve_HWC_q7_RGB(const q7_t * Im_in, 414 const uint16_t dim_im_in, 415 const uint16_t ch_im_in, 416 const q7_t * wt, 417 const uint16_t ch_im_out, 418 const uint16_t dim_kernel, 419 const uint16_t padding, 420 const uint16_t stride, 421 const q7_t * bias, 422 const uint16_t bias_shift, 423 const uint16_t out_shift, 424 q7_t * Im_out, 425 const uint16_t dim_im_out, 426 q15_t * bufferA, 427 q7_t * bufferB); 428 429 /** 430 * @brief Fast Q15 convolution function 431 * @param[in] Im_in pointer to input tensor 432 * @param[in] dim_im_in input tensor dimention 433 * @param[in] ch_im_in number of input tensor channels 434 * @param[in] wt pointer to kernel weights 435 * @param[in] ch_im_out number of filters, i.e., output tensor channels 436 * @param[in] dim_kernel filter kernel size 437 * @param[in] padding padding sizes 438 * @param[in] stride convolution stride 439 * @param[in] bias pointer to bias 440 * @param[in] bias_shift amount of left-shift for bias 441 * @param[in] out_shift amount of right-shift for output 442 * @param[in,out] Im_out pointer to output tensor 443 * @param[in] dim_im_out output tensor dimension 444 * @param[in,out] bufferA pointer to buffer space for input 445 * @param[in,out] bufferB pointer to buffer space for output 446 * @return The function returns either 447 * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 
448 * 449 * This function is the version with full list of optimization tricks, but with 450 * some contraints: 451 * ch_im_in is multiple of 2 452 * ch_im_out is multiple of 2 453 */ 454 455 riscv_status riscv_convolve_HWC_q15_fast(const q15_t * Im_in, 456 const uint16_t dim_im_in, 457 const uint16_t ch_im_in, 458 const q15_t * wt, 459 const uint16_t ch_im_out, 460 const uint16_t dim_kernel, 461 const uint16_t padding, 462 const uint16_t stride, 463 const q15_t * bias, 464 const uint16_t bias_shift, 465 const uint16_t out_shift, 466 q15_t * Im_out, 467 const uint16_t dim_im_out, 468 q15_t * bufferA, 469 q7_t * bufferB); 470 471 /** 472 * @brief Fast Q15 convolution function (non-sqaure shape) 473 * @param[in] Im_in pointer to input tensor 474 * @param[in] dim_im_in_x input tensor dimention x 475 * @param[in] dim_im_in_y input tensor dimention y 476 * @param[in] ch_im_in number of input tensor channels 477 * @param[in] wt pointer to kernel weights 478 * @param[in] ch_im_out number of filters, i.e., output tensor channels 479 * @param[in] dim_kernel_x filter kernel size x 480 * @param[in] dim_kernel_y filter kernel size y 481 * @param[in] padding_x padding size x 482 * @param[in] padding_y padding size y 483 * @param[in] stride_x convolution stride x 484 * @param[in] stride_y convolution stride y 485 * @param[in] bias pointer to bias 486 * @param[in] bias_shift amount of left-shift for bias 487 * @param[in] out_shift amount of right-shift for output 488 * @param[in,out] Im_out pointer to output tensor 489 * @param[in] dim_im_out_x output tensor dimension x 490 * @param[in] dim_im_out_y output tensor dimension y 491 * @param[in,out] bufferA pointer to buffer space for input 492 * @param[in,out] bufferB pointer to buffer space for output 493 * @return The function returns either 494 * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 
495 * 496 * @details 497 * 498 * <b>Buffer size:</b> 499 * 500 * bufferA size: 2*ch_im_in*dim_kernel*dim_kernel 501 * 502 * bufferB size: 0 503 * 504 * <b>Input dimension constraints:</b> 505 * 506 * ch_im_in is multiple of 2 507 * 508 * ch_im_out is multipe of 2 509 * 510 */ 511 512 riscv_status 513 riscv_convolve_HWC_q15_fast_nonsquare(const q15_t * Im_in, 514 const uint16_t dim_im_in_x, 515 const uint16_t dim_im_in_y, 516 const uint16_t ch_im_in, 517 const q15_t * wt, 518 const uint16_t ch_im_out, 519 const uint16_t dim_kernel_x, 520 const uint16_t dim_kernel_y, 521 const uint16_t padding_x, 522 const uint16_t padding_y, 523 const uint16_t stride_x, 524 const uint16_t stride_y, 525 const q15_t * bias, 526 const uint16_t bias_shift, 527 const uint16_t out_shift, 528 q15_t * Im_out, 529 const uint16_t dim_im_out_x, 530 const uint16_t dim_im_out_y, 531 q15_t * bufferA, 532 q7_t * bufferB); 533 534 /** 535 * @brief Q7 depthwise separable convolution function 536 * @param[in] Im_in pointer to input tensor 537 * @param[in] dim_im_in input tensor dimention 538 * @param[in] ch_im_in number of input tensor channels 539 * @param[in] wt pointer to kernel weights 540 * @param[in] ch_im_out number of filters, i.e., output tensor channels 541 * @param[in] dim_kernel filter kernel size 542 * @param[in] padding padding sizes 543 * @param[in] stride convolution stride 544 * @param[in] bias pointer to bias 545 * @param[in] bias_shift amount of left-shift for bias 546 * @param[in] out_shift amount of right-shift for output 547 * @param[in,out] Im_out pointer to output tensor 548 * @param[in] dim_im_out output tensor dimension 549 * @param[in,out] bufferA pointer to buffer space for input 550 * @param[in,out] bufferB pointer to buffer space for output 551 * @return The function returns either 552 * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 
553 * 554 * This function is the version with full list of optimization tricks, but with 555 * some contraints: 556 * ch_im_in is multiple of 2 557 * ch_im_out is multiple of 2 558 */ 559 560 riscv_status riscv_depthwise_separable_conv_HWC_q7(const q7_t * Im_in, 561 const uint16_t dim_im_in, 562 const uint16_t ch_im_in, 563 const q7_t * wt, 564 const uint16_t ch_im_out, 565 const uint16_t dim_kernel, 566 const uint16_t padding, 567 const uint16_t stride, 568 const q7_t * bias, 569 const uint16_t bias_shift, 570 const uint16_t out_shift, 571 q7_t * Im_out, 572 const uint16_t dim_im_out, 573 q15_t * bufferA, 574 q7_t * bufferB); 575 576 /** 577 * @brief Q7 depthwise separable convolution function (non-square shape) 578 * @param[in] Im_in pointer to input tensor 579 * @param[in] dim_im_in_x input tensor dimention x 580 * @param[in] dim_im_in_y input tensor dimention y 581 * @param[in] ch_im_in number of input tensor channels 582 * @param[in] wt pointer to kernel weights 583 * @param[in] ch_im_out number of filters, i.e., output tensor channels 584 * @param[in] dim_kernel_x filter kernel size x 585 * @param[in] dim_kernel_y filter kernel size y 586 * @param[in] padding_x padding sizes x 587 * @param[in] padding_y padding sizes y 588 * @param[in] stride_x convolution stride x 589 * @param[in] stride_y convolution stride y 590 * @param[in] bias pointer to bias 591 * @param[in] bias_shift amount of left-shift for bias 592 * @param[in] out_shift amount of right-shift for output 593 * @param[in,out] Im_out pointer to output tensor 594 * @param[in] dim_im_out_x output tensor dimension x 595 * @param[in] dim_im_out_y output tensor dimension y 596 * @param[in,out] bufferA pointer to buffer space for input 597 * @param[in,out] bufferB pointer to buffer space for output 598 * @return The function returns either 599 * <code>RISCV_MATH_SIZE_MISMATCH</code> or <code>RISCV_MATH_SUCCESS</code> based on the outcome of size checking. 
600 * 601 * This function is the version with full list of optimization tricks, but with 602 * some contraints: 603 * ch_im_in is multiple of 2 604 * ch_im_out is multiple of 2 605 */ 606 riscv_status riscv_depthwise_separable_conv_HWC_q7_nonsquare(const q7_t * Im_in, 607 const uint16_t dim_im_in_x, 608 const uint16_t dim_im_in_y, 609 const uint16_t ch_im_in, 610 const q7_t * wt, 611 const uint16_t ch_im_out, 612 const uint16_t dim_kernel_x, 613 const uint16_t dim_kernel_y, 614 const uint16_t padding_x, 615 const uint16_t padding_y, 616 const uint16_t stride_x, 617 const uint16_t stride_y, 618 const q7_t * bias, 619 const uint16_t bias_shift, 620 const uint16_t out_shift, 621 q7_t * Im_out, 622 const uint16_t dim_im_out_x, 623 const uint16_t dim_im_out_y, 624 q15_t * bufferA, 625 q7_t * bufferB); 626 627 628 /** 629 * @defgroup FC Fully-connected Layer Functions 630 * 631 * Perform fully-connected layer 632 * 633 * Fully-connected layer is basically a matrix-vector multiplication 634 * with bias. The matrix is the weights and the input/output vectors 635 * are the activation values. Supported {weight, activation} precisions 636 * include {8-bit, 8-bit}, {16-bit, 16-bit}, and {8-bit, 16-bit}. 637 * 638 * Here we have two types of kernel functions. The basic function 639 * implements the function using regular GEMV approach. The opt functions 640 * operates with weights in interleaved formats. 
641 * 642 */ 643 644 /** 645 * @brief Q7 basic fully-connected layer function 646 * @param[in] pV pointer to input vector 647 * @param[in] pM pointer to matrix weights 648 * @param[in] dim_vec length of the vector 649 * @param[in] num_of_rows number of rows in weight matrix 650 * @param[in] bias_shift amount of left-shift for bias 651 * @param[in] out_shift amount of right-shift for output 652 * @param[in] bias pointer to bias 653 * @param[in,out] pOut pointer to output vector 654 * @param[in,out] vec_buffer pointer to buffer space for input 655 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 656 * 657 */ 658 659 riscv_status riscv_fully_connected_q7(const q7_t * pV, 660 const q7_t * pM, 661 const uint16_t dim_vec, 662 const uint16_t num_of_rows, 663 const uint16_t bias_shift, 664 const uint16_t out_shift, 665 const q7_t * bias, 666 q7_t * pOut, 667 q15_t * vec_buffer); 668 669 /** 670 * @brief S8 basic fully-connected layer function for TF Lite 671 * @param[in] pInput pointer to pInput vector 672 * @param[in] pWeight pointer to matrix weights 673 * @param[in] col_dim dimension of the input vector 674 * @param[in] row_dim dimension of the output vector 675 * @param[in] nb_batches number of batches 676 * @param[in] input_offset 677 * @param[in] filter_offset 678 * @param[in] out_mult requantization parameter 679 * @param[in] out_shift requantization parameter 680 * @param[in] output_offset 681 * @param[in] pBias pointer to bias 682 * @param[out] pOut pointer to output vector 683 * @param[in] output_activation_min for clamping 684 * @param[in] output_activation_max for clamping 685 * @param[in,out] vec_buffer pointer to buffer space for pInput 686 * @return The function returns RISCV_MATH_SUCCESS 687 * 688 * @details 689 * 690 * <b>Buffer size:</b> 691 * 692 * vec_buffer size: col_dim of word16. 693 * 694 * This basic function is designed to work with regular pWeight 695 * matrix without interleaving. 
696 * 697 */ 698 riscv_status 699 riscv_fully_connected_s8(const int8_t *pInput, 700 const int8_t *weight, 701 const uint16_t input_length, 702 const uint16_t num_rows, 703 const uint16_t nb_batches, 704 const int32_t input_offset, 705 const int32_t filter_offset, 706 const int32_t out_mult, 707 const int32_t out_shift, 708 const int32_t output_offset, 709 const int8_t *bias, 710 int8_t *pOut, 711 const int32_t output_activation_min, 712 const int32_t output_activation_max, 713 q15_t *vec_buffer) ; 714 715 /** 716 * @brief Q7 opt fully-connected layer function 717 * @param[in] pV pointer to input vector 718 * @param[in] pM pointer to matrix weights 719 * @param[in] dim_vec length of the vector 720 * @param[in] num_of_rows number of rows in weight matrix 721 * @param[in] bias_shift amount of left-shift for bias 722 * @param[in] out_shift amount of right-shift for output 723 * @param[in] bias pointer to bias 724 * @param[in,out] pOut pointer to output vector 725 * @param[in,out] vec_buffer pointer to buffer space for input 726 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 727 * 728 */ 729 730 riscv_status riscv_fully_connected_q7_opt(const q7_t * pV, 731 const q7_t * pM, 732 const uint16_t dim_vec, 733 const uint16_t num_of_rows, 734 const uint16_t bias_shift, 735 const uint16_t out_shift, 736 const q7_t * bias, 737 q7_t * pOut, 738 q15_t * vec_buffer); 739 740 /** 741 * @brief Q15 basic fully-connected layer function 742 * @param[in] pV pointer to input vector 743 * @param[in] pM pointer to matrix weights 744 * @param[in] dim_vec length of the vector 745 * @param[in] num_of_rows number of rows in weight matrix 746 * @param[in] bias_shift amount of left-shift for bias 747 * @param[in] out_shift amount of right-shift for output 748 * @param[in] bias pointer to bias 749 * @param[in,out] pOut pointer to output vector 750 * @param[in,out] vec_buffer pointer to buffer space for input 751 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 752 * 
753 */ 754 755 riscv_status riscv_fully_connected_q15(const q15_t * pV, 756 const q15_t * pM, 757 const uint16_t dim_vec, 758 const uint16_t num_of_rows, 759 const uint16_t bias_shift, 760 const uint16_t out_shift, 761 const q15_t * bias, 762 q15_t * pOut, 763 q15_t * vec_buffer); 764 765 /** 766 * @brief Q15 opt fully-connected layer function 767 * @param[in] pV pointer to input vector 768 * @param[in] pM pointer to matrix weights 769 * @param[in] dim_vec length of the vector 770 * @param[in] num_of_rows number of rows in weight matrix 771 * @param[in] bias_shift amount of left-shift for bias 772 * @param[in] out_shift amount of right-shift for output 773 * @param[in] bias pointer to bias 774 * @param[in,out] pOut pointer to output vector 775 * @param[in,out] vec_buffer pointer to buffer space for input 776 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 777 * 778 */ 779 780 riscv_status riscv_fully_connected_q15_opt(const q15_t * pV, 781 const q15_t * pM, 782 const uint16_t dim_vec, 783 const uint16_t num_of_rows, 784 const uint16_t bias_shift, 785 const uint16_t out_shift, 786 const q15_t * bias, 787 q15_t * pOut, 788 q15_t * vec_buffer); 789 790 /** 791 * @brief Mixed Q15-Q7 fully-connected layer function 792 * @param[in] pV pointer to input vector 793 * @param[in] pM pointer to matrix weights 794 * @param[in] dim_vec length of the vector 795 * @param[in] num_of_rows number of rows in weight matrix 796 * @param[in] bias_shift amount of left-shift for bias 797 * @param[in] out_shift amount of right-shift for output 798 * @param[in] bias pointer to bias 799 * @param[in,out] pOut pointer to output vector 800 * @param[in,out] vec_buffer pointer to buffer space for input 801 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 802 * 803 */ 804 805 riscv_status riscv_fully_connected_mat_q7_vec_q15(const q15_t * pV, 806 const q7_t * pM, 807 const uint16_t dim_vec, 808 const uint16_t num_of_rows, 809 const uint16_t bias_shift, 810 const 
uint16_t out_shift, 811 const q7_t * bias, 812 q15_t * pOut, 813 q15_t * vec_buffer); 814 815 /** 816 * @brief Mixed Q15-Q7 opt fully-connected layer function 817 * @param[in] pV pointer to input vector 818 * @param[in] pM pointer to matrix weights 819 * @param[in] dim_vec length of the vector 820 * @param[in] num_of_rows number of rows in weight matrix 821 * @param[in] bias_shift amount of left-shift for bias 822 * @param[in] out_shift amount of right-shift for output 823 * @param[in] bias pointer to bias 824 * @param[in,out] pOut pointer to output vector 825 * @param[in,out] vec_buffer pointer to buffer space for input 826 * @return The function returns <code>RISCV_MATH_SUCCESS</code> 827 * 828 */ 829 830 riscv_status riscv_fully_connected_mat_q7_vec_q15_opt(const q15_t * pV, 831 const q7_t * pM, 832 const uint16_t dim_vec, 833 const uint16_t num_of_rows, 834 const uint16_t bias_shift, 835 const uint16_t out_shift, 836 const q7_t * bias, 837 q15_t * pOut, 838 q15_t * vec_buffer); 839 840 /** 841 * @brief Matrix-Multiplication Kernels for Convolution 842 * 843 * These functions are used within convolution layer functions for 844 * matrix multiplication. 845 * 846 * The implementation is similar to NMSIS-DSP riscv_mat_mult functions 847 * with one Q7 and one Q15 operands. The Q15 operand is the im2col 848 * output which is always with 2 columns. 
849 * 850 */ 851 852 /** 853 * @brief Matrix-multiplication function for convolution 854 * @param[in] pA pointer to operand A 855 * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors 856 * @param[in] ch_im_out numRow of A 857 * @param[in] numCol_A numCol of A 858 * @param[in] bias_shift amount of left-shift for bias 859 * @param[in] out_shift amount of right-shift for output 860 * @param[in] bias the bias 861 * @param[in,out] pOut pointer to output 862 * @return The function returns the incremented output pointer 863 */ 864 865 q7_t *riscv_nn_mat_mult_kernel_q7_q15(const q7_t * pA, 866 const q15_t * pInBuffer, 867 const uint16_t ch_im_out, 868 const uint16_t numCol_A, 869 const uint16_t bias_shift, 870 const uint16_t out_shift, 871 const q7_t * bias, 872 q7_t * pOut); 873 874 q7_t *riscv_nn_mat_mult_kernel_q7(const q7_t * pA, 875 const q7_t * pInBuffer, 876 const uint16_t ch_im_out, 877 const uint16_t numCol_A, 878 const uint16_t bias_shift, 879 const uint16_t out_shift, 880 const q7_t * bias, 881 q7_t * pOut); 882 883 /** 884 * @brief Matrix-multiplication function for convolution with reordered columns 885 * @param[in] pA pointer to operand A 886 * @param[in] pInBuffer pointer to operand B, always conssists of 2 vectors 887 * @param[in] ch_im_out numRow of A 888 * @param[in] numCol_A numCol of A 889 * @param[in] bias_shift amount of left-shift for bias 890 * @param[in] out_shift amount of right-shift for output 891 * @param[in] bias the bias 892 * @param[in,out] pOut pointer to output 893 * @return The function returns the incremented output pointer 894 */ 895 896 q7_t *riscv_nn_mat_mult_kernel_q7_q15_reordered(const q7_t * pA, 897 const q15_t * pInBuffer, 898 const uint16_t ch_im_out, 899 const uint16_t numCol_A, 900 const uint16_t bias_shift, 901 const uint16_t out_shift, 902 const q7_t * bias, 903 q7_t * pOut); 904 905 q7_t *riscv_nn_mat_mult_kernel_q7_reordered(const q7_t * pA, 906 const q7_t * pInBuffer, 907 const uint16_t ch_im_out, 
                                            const uint16_t numCol_A,
                                            const uint16_t bias_shift,
                                            const uint16_t out_shift,
                                            const q7_t * bias,
                                            q7_t * pOut);

#ifdef __cplusplus
}
#endif

/*
 * Other functions
 * These layers are typically not timing critical
 * Basic implementation is supported here
 */

#ifdef __cplusplus
extern "C"
{
#endif

/**
 * @defgroup Acti Neural Network Activation Functions
 *
 * Perform activation layers, including ReLU (Rectified Linear Unit),
 * sigmoid and tanh
 *
 */

/**
 * @brief Q7 RELU function
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @return none.
 */

void riscv_relu_q7(q7_t * data, uint16_t size);

/**
 * @brief Q15 RELU function
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @return none.
 */

void riscv_relu_q15(q15_t * data, uint16_t size);

/**
 * @brief Q7 neural network activation function using direct table look-up
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assume to be smaller than 3
 * @param[in]       type        type of activation functions
 * @return none.
 */

void riscv_nn_activations_direct_q7(q7_t * data, uint16_t size, uint16_t int_width,
                                    riscv_nn_activation_type type);

/**
 * @brief Q15 neural network activation function using direct table look-up
 * @param[in,out]   data        pointer to input
 * @param[in]       size        number of elements
 * @param[in]       int_width   bit-width of the integer part, assume to be smaller than 3
 * @param[in]       type        type of activation functions
 * @return none.
 */

void riscv_nn_activations_direct_q15(q15_t * data, uint16_t size, uint16_t int_width,
                                     riscv_nn_activation_type type);

/**
 * @defgroup Pooling Neural Network Pooling Functions
 *
 * Perform pooling functions, including max pooling and average pooling
 *
 */

/**
 * @brief Q7 max pooling function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   Im_out      pointer to output tensor
 * @return none.
 *
 */

void riscv_maxpool_q7_HWC(q7_t * Im_in,
                          const uint16_t dim_im_in,
                          const uint16_t ch_im_in,
                          const uint16_t dim_kernel,
                          const uint16_t padding,
                          const uint16_t stride,
                          const uint16_t dim_im_out,
                          q7_t * bufferA,
                          q7_t * Im_out);

/**
 * @brief Q7 average pooling function
 * @param[in]       Im_in       pointer to input tensor
 * @param[in]       dim_im_in   input tensor dimension
 * @param[in]       ch_im_in    number of input tensor channels
 * @param[in]       dim_kernel  filter kernel size
 * @param[in]       padding     padding sizes
 * @param[in]       stride      convolution stride
 * @param[in]       dim_im_out  output tensor dimension
 * @param[in,out]   bufferA     pointer to buffer space for input
 * @param[in,out]   Im_out      pointer to output tensor
 * @return none.
 *
 */

void riscv_avepool_q7_HWC(q7_t * Im_in,
                          const uint16_t dim_im_in,
                          const uint16_t ch_im_in,
                          const uint16_t dim_kernel,
                          const uint16_t padding,
                          const uint16_t stride,
                          const uint16_t dim_im_out,
                          q7_t * bufferA,
                          q7_t * Im_out);

/**
 * @defgroup Softmax Softmax Functions
 *
 * EXP(2) based softmax function
 *
 */

/**
 * @brief Q7 softmax function
 * @param[in]       vec_in      pointer to input vector
 * @param[in]       dim_vec     input vector dimension
 * @param[out]      p_out       pointer to output vector
 * @return none.
 *
 */

void riscv_softmax_q7(const q7_t * vec_in, const uint16_t dim_vec, q7_t * p_out);

/**
 * @brief Q15 softmax function
 * @param[in]       vec_in      pointer to input vector
 * @param[in]       dim_vec     input vector dimension
 * @param[out]      p_out       pointer to output vector
 * @return none.
 *
 */

void riscv_softmax_q15(const q15_t * vec_in, const uint16_t dim_vec, q15_t * p_out);

/**
 * @brief uint8 depthwise convolution function with asymmetric quantization for even number of channel multiplier
 *        and input channels. Unless specified otherwise, arguments are mandatory.
 *
 * @param[in]     input     Pointer to input tensor
 * @param[in]     input_x   Width of input tensor
 * @param[in]     input_y   Height of input tensor
 * @param[in]     input_ch  Channels in input tensor
 * @param[in]     kernel    Pointer to kernel weights
 * @param[in]     kernel_x  Width of kernel
 * @param[in]     kernel_y  Height of kernel
 * @param[in]     ch_mult   Number of channel multiplier
 * @param[in]     pad_x     Padding sizes x
 * @param[in]     pad_y     Padding sizes y
 * @param[in]     stride_x  Convolution stride along the width
 * @param[in]     stride_y  Convolution stride along the height
 * @param[in]     dilation_x Dilation along width. Not used and intended for future enhancement.
 * @param[in]     dilation_y Dilation along height. Not used and intended for future enhancement.
 * @param[in]     bias      Pointer to optional bias values. If no bias is
 *                          available, NULL is expected
 * @param[in]     input_offset  Input tensor zero offset
 * @param[in]     filter_offset Kernel tensor zero offset
 * @param[in]     output_offset Output tensor zero offset
 * @param[in,out] output    Pointer to output tensor
 * @param[in]     output_x  Width of output tensor
 * @param[in]     output_y  Height of output tensor
 * @param[in]     output_activation_min Minimum value to clamp the output to. Range : {0, 255}
 * @param[in]     output_activation_max Maximum value to clamp the output to. Range : {0, 255}
 * @param[in]     out_shift Amount of right-shift for output
 * @param[in]     out_mult  Output multiplier for requantization
 * @return        The function returns one of the following
 *                <code>RISCV_MATH_SIZE_MISMATCH</code> - Not supported dimension of tensors
 *                <code>RISCV_MATH_SUCCESS</code> - Successful operation
 *                <code>RISCV_MATH_ARGUMENT_ERROR</code> - Implementation not available
 *
 * <b> Input constraints</b>
 * ch_mult  is multiple of 2
 * kernel_x is multiple of 2
 *
 */
riscv_status riscv_depthwise_conv_u8_basic_ver1(const uint8_t *input,
                                                const uint16_t input_x,
                                                const uint16_t input_y,
                                                const uint16_t input_ch,
                                                const uint8_t *kernel,
                                                const uint16_t kernel_x,
                                                const uint16_t kernel_y,
                                                const int16_t ch_mult,
                                                const int16_t pad_x,
                                                const int16_t pad_y,
                                                const int16_t stride_x,
                                                const int16_t stride_y,
                                                const int16_t dilation_x,
                                                const int16_t dilation_y,
                                                const int32_t *bias,
                                                const int32_t input_offset,
                                                const int32_t filter_offset,
                                                const int32_t output_offset,
                                                uint8_t *output,
                                                const uint16_t output_x,
                                                const uint16_t output_y,
                                                const int32_t output_activation_min,
                                                const int32_t output_activation_max,
                                                const int32_t out_shift,
                                                const int32_t out_mult);
#ifdef __cplusplus
}
#endif

#endif