/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mat_mult_q7.c
 * Description:  Q7 matrix multiplication
 *
 * $Date:        23 April 2021
 *
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
12 /*
13 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14 *
15 * SPDX-License-Identifier: Apache-2.0
16 *
17 * Licensed under the Apache License, Version 2.0 (the License); you may
18 * not use this file except in compliance with the License.
19 * You may obtain a copy of the License at
20 *
21 * www.apache.org/licenses/LICENSE-2.0
22 *
23 * Unless required by applicable law or agreed to in writing, software
24 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26 * See the License for the specific language governing permissions and
27 * limitations under the License.
28 */
29
30 #include "dsp/matrix_functions.h"
31
32 /**
33 @ingroup groupMatrix
34 */
35
36 /**
37 @addtogroup MatrixMult
38 @{
39 */
40
/**
 * @brief Q7 matrix multiplication
 * @param[in]  pSrcA  points to the first input matrix structure
 * @param[in]  pSrcB  points to the second input matrix structure
 * @param[out] pDst   points to the output matrix structure
 * @param[in]  pState points to a scratch buffer. The Helium (MVE) version writes the
 *                    transpose of the second input matrix into it (one Q7 element per
 *                    element of pSrcB) unless the product is a 2x2, 3x3 or 4x4 square
 *                    case; the scalar version does not use it.
 * @return The function returns <code>ARM_MATH_SUCCESS</code>, or
 * <code>ARM_MATH_SIZE_MISMATCH</code> when <code>ARM_MATH_MATRIX_CHECK</code> is defined
 * and the matrix dimensions are not compatible.
 *
 * @details
 * <b>Scaling and Overflow Behavior:</b>
 *
 * \par
 * The products are summed in a 32-bit internal accumulator. Each sum is then shifted
 * right by 7 bits and saturated to 1.7 format before being stored.
 *
 */
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mat_mult_q7_2x2_mve(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst)59 __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
60 const arm_matrix_instance_q7 * pSrcA,
61 const arm_matrix_instance_q7 * pSrcB,
62 arm_matrix_instance_q7 * pDst)
63 {
64 const uint32_t MATRIX_DIM = 2;
65 q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
66 q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
67 q7_t *pOut = pDst->pData; /* output data matrix pointer */
68 uint8x16_t vecColBOffs;
69 q7_t *pInA0 = pInA;
70 q7_t *pInA1 = pInA0 + MATRIX_DIM;
71 q31_t acc0, acc1;
72 q7x16_t vecB, vecA0, vecA1;
73 mve_pred16_t p0 = vctp8q(MATRIX_DIM);
74
75 vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
76
77 pInB = pSrcB->pData;
78
79 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
80
81 vecA0 = vldrbq_s8(pInA0);
82 vecA1 = vldrbq_s8(pInA1);
83
84 acc0 = vmladavq_s8(vecA0, vecB);
85 acc1 = vmladavq_s8(vecA1, vecB);
86
87 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
88 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
89 pOut++;
90
91 /* move to next B column */
92 pInB = pInB + 1;
93
94 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
95
96 acc0 = vmladavq_s8(vecA0, vecB);
97 acc1 = vmladavq_s8(vecA1, vecB);
98
99 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
100 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
101 /*
102 * Return to application
103 */
104 return (ARM_MATH_SUCCESS);
105 }
106
107
arm_mat_mult_q7_3x3_mve(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst)108 __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
109 const arm_matrix_instance_q7 * pSrcA,
110 const arm_matrix_instance_q7 * pSrcB,
111 arm_matrix_instance_q7 * pDst)
112 {
113 const uint8_t MATRIX_DIM = 3;
114 q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
115 q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
116 q7_t *pOut = pDst->pData; /* output data matrix pointer */
117 uint8x16_t vecColBOffs;
118 q7_t *pInA0 = pInA;
119 q7_t *pInA1 = pInA0 + MATRIX_DIM;
120 q7_t *pInA2 = pInA1 + MATRIX_DIM;
121 q31_t acc0, acc1, acc2;
122 q7x16_t vecB, vecA0, vecA1, vecA2;
123 mve_pred16_t p0 = vctp8q(MATRIX_DIM);
124
125 vecColBOffs = vidupq_u8((uint32_t)0, 1);
126 vecColBOffs = vecColBOffs * MATRIX_DIM;
127
128 pInB = pSrcB->pData;
129
130 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
131
132 vecA0 = vldrbq_s8(pInA0);
133 vecA1 = vldrbq_s8(pInA1);
134 vecA2 = vldrbq_s8(pInA2);
135
136 acc0 = vmladavq_s8(vecA0, vecB);
137 acc1 = vmladavq_s8(vecA1, vecB);
138 acc2 = vmladavq_s8(vecA2, vecB);
139
140 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
141 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
142 pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
143 pOut++;
144
145 /* move to next B column */
146 pInB = pInB + 1;
147
148 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
149
150 acc0 = vmladavq_s8(vecA0, vecB);
151 acc1 = vmladavq_s8(vecA1, vecB);
152 acc2 = vmladavq_s8(vecA2, vecB);
153
154 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
155 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
156 pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
157 pOut++;
158
159 /* move to next B column */
160 pInB = pInB + 1;
161
162 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
163
164 acc0 = vmladavq_s8(vecA0, vecB);
165 acc1 = vmladavq_s8(vecA1, vecB);
166 acc2 = vmladavq_s8(vecA2, vecB);
167
168 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
169 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
170 pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
171 /*
172 * Return to application
173 */
174 return (ARM_MATH_SUCCESS);
175 }
176
177
arm_mat_mult_q7_4x4_mve(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst)178 __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
179 const arm_matrix_instance_q7 * pSrcA,
180 const arm_matrix_instance_q7 * pSrcB,
181 arm_matrix_instance_q7 * pDst)
182 {
183 const uint32_t MATRIX_DIM = 4;
184 q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
185 q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
186 q7_t *pOut = pDst->pData; /* output data matrix pointer */
187 uint8x16_t vecColBOffs;
188 q7_t *pInA0 = pInA;
189 q7_t *pInA1 = pInA0 + MATRIX_DIM;
190 q7_t *pInA2 = pInA1 + MATRIX_DIM;
191 q7_t *pInA3 = pInA2 + MATRIX_DIM;
192 q31_t acc0, acc1, acc2, acc3;
193 q7x16_t vecB, vecA0, vecA1, vecA2, vecA3;
194 mve_pred16_t p0 = vctp8q(MATRIX_DIM);
195
196 vecColBOffs = vidupq_u8((uint32_t)0, 4);
197
198 pInB = pSrcB->pData;
199
200 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
201
202 vecA0 = vldrbq_s8(pInA0);
203 vecA1 = vldrbq_s8(pInA1);
204 vecA2 = vldrbq_s8(pInA2);
205 vecA3 = vldrbq_s8(pInA3);
206
207 acc0 = vmladavq_s8(vecA0, vecB);
208 acc1 = vmladavq_s8(vecA1, vecB);
209 acc2 = vmladavq_s8(vecA2, vecB);
210 acc3 = vmladavq_s8(vecA3, vecB);
211
212 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
213 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
214 pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
215 pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
216 pOut++;
217
218 /* move to next B column */
219 pInB = pInB + 1;
220
221 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
222
223 acc0 = vmladavq_s8(vecA0, vecB);
224 acc1 = vmladavq_s8(vecA1, vecB);
225 acc2 = vmladavq_s8(vecA2, vecB);
226 acc3 = vmladavq_s8(vecA3, vecB);
227
228 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
229 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
230 pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
231 pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
232 pOut++;
233
234 /* move to next B column */
235 pInB = pInB + 1;
236
237 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
238
239 acc0 = vmladavq_s8(vecA0, vecB);
240 acc1 = vmladavq_s8(vecA1, vecB);
241 acc2 = vmladavq_s8(vecA2, vecB);
242 acc3 = vmladavq_s8(vecA3, vecB);
243
244 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
245 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
246 pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
247 pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
248 pOut++;
249
250 /* move to next B column */
251 pInB = pInB + 1;
252
253 vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
254
255 acc0 = vmladavq_s8(vecA0, vecB);
256 acc1 = vmladavq_s8(vecA1, vecB);
257 acc2 = vmladavq_s8(vecA2, vecB);
258 acc3 = vmladavq_s8(vecA3, vecB);
259
260 pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
261 pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
262 pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
263 pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
264 /*
265 * Return to application
266 */
267 return (ARM_MATH_SUCCESS);
268 }
269
arm_mat_mult_q7(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst,q7_t * pState)270 arm_status arm_mat_mult_q7(
271 const arm_matrix_instance_q7 * pSrcA,
272 const arm_matrix_instance_q7 * pSrcB,
273 arm_matrix_instance_q7 * pDst,
274 q7_t * pState)
275 {
276 q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
277 q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
278 q7_t *pInA2;
279 q7_t *pInB2;
280 q7_t *px; /* Temporary output data matrix pointer */
281 q7_t *px2; /* Temporary output data matrix pointer */
282 uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
283 uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
284 uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
285 uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
286 uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
287 q7_t *pSrcBT = pState; /* input data matrix pointer for transpose */
288 uint32_t blkCnt; /* loop counters */
289 arm_status status; /* status of matrix multiplication */
290 arm_matrix_instance_q7 BT;
291
292
293 #ifdef ARM_MATH_MATRIX_CHECK
294
295 /* Check for matrix mismatch condition */
296 if ((pSrcA->numCols != pSrcB->numRows) ||
297 (pSrcA->numRows != pDst->numRows) ||
298 (pSrcB->numCols != pDst->numCols) )
299 {
300 /* Set status as ARM_MATH_SIZE_MISMATCH */
301 status = ARM_MATH_SIZE_MISMATCH;
302 }
303 else
304
305 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
306 {
307 /* small squared matrix specialized routines */
308 if(numRowsA == numColsB && numColsB == numColsA) {
309 if(numRowsA == 2)
310 return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
311 else if(numRowsA == 3)
312 return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
313 else if (numRowsA == 4)
314 return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
315 }
316 /*
317 * Matrix transpose
318 */
319
320 BT.numRows = numColsB;
321 BT.numCols = numRowsB;
322 BT.pData = pSrcBT;
323
324 arm_mat_trans_q7(pSrcB, &BT);
325
326 /*
327 * Reset the variables for the usage in the following multiplication process
328 */
329 i = 0;
330 row = numRowsA >> 1;
331 px = pDst->pData;
332 px2 = px + numColsB;
333
334 /*
335 * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
336 */
337
338 /*
339 * row loop
340 */
341 while (row > 0u)
342 {
343 /*
344 * For every row wise process, the column loop counter is to be initiated
345 */
346 col = numColsB >> 1;
347 /*
348 * For every row wise process, the pIn2 pointer is set
349 * to the starting address of the transposed pSrcB data
350 */
351 pInB = pSrcBT;
352 pInB2 = pInB + numRowsB;
353 j = 0;
354
355 /*
356 * column loop
357 */
358 while (col > 0u)
359 {
360 q7_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
361 q7x16_t vecA, vecA2, vecB, vecB2;
362 q31_t acc0, acc1, acc2, acc3;
363
364 /*
365 * Initiate the pointer pIn1 to point to the starting address of the column being processed
366 */
367 pInA = pSrcA->pData + i;
368 pInA2 = pInA + numColsA;
369 pInB = pSrcBT + j;
370 pInB2 = pInB + numRowsB;
371
372 pSrcAVec = (q7_t const *) pInA;
373 pSrcA2Vec = (q7_t const *)pInA2;
374 pSrcBVec = (q7_t const *) pInB;
375 pSrcB2Vec = (q7_t const *)pInB2;
376
377 acc0 = 0L;
378 acc1 = 0L;
379 acc2 = 0L;
380 acc3 = 0L;
381
382 vecA = vld1q(pSrcAVec);
383 pSrcAVec += 16;
384
385 blkCnt = numColsA >> 4;
386 while (blkCnt > 0U)
387 {
388 vecB = vld1q(pSrcBVec);
389 pSrcBVec += 16;
390 acc0 = vmladavaq_s8(acc0, vecA, vecB);
391 vecA2 = vld1q(pSrcA2Vec);
392 pSrcA2Vec += 16;
393 acc1 = vmladavaq_s8(acc1, vecA2, vecB);
394 vecB2 = vld1q(pSrcB2Vec);
395 pSrcB2Vec += 16;
396 acc2 = vmladavaq_s8(acc2, vecA, vecB2);
397 vecA = vld1q(pSrcAVec);
398 pSrcAVec += 16;
399 acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
400
401 blkCnt--;
402 }
403 /*
404 * tail
405 * (will be merged thru tail predication)
406 */
407 blkCnt = numColsA & 0xF;
408 if (blkCnt > 0U)
409 {
410 mve_pred16_t p0 = vctp8q(blkCnt);
411 vecB = vld1q(pSrcBVec);
412 acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
413 vecA2 = vld1q(pSrcA2Vec);
414 acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
415 vecB2 = vld1q(pSrcB2Vec);
416 acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
417 vecA = vld1q(pSrcAVec);
418 acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
419 }
420
421 *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
422 *px++ = (q7_t) __SSAT(acc2 >> 7, 8);
423 *px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
424 *px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
425 j += numRowsB * 2;
426 /*
427 * Decrement the column loop counter
428 */
429 col--;
430
431 }
432
433 i = i + numColsA * 2;
434 px = px2 + (numColsB & 1u);
435 px2 = px + numColsB;
436 /*
437 * Decrement the row loop counter
438 */
439 row--;
440 }
441
442 /*
443 * Compute remaining row and/or column below
444 */
445
446 if (numColsB & 1u)
447 {
448 row = numRowsA & (~0x1); //avoid redundant computation
449 px = pDst->pData + numColsB - 1;
450 i = 0;
451
452 /*
453 * row loop
454 */
455 while (row > 0)
456 {
457 q7_t const *pSrcAVec, *pSrcBVec;
458 q7x16_t vecA, vecB;
459 q63_t acc0;
460
461 /*
462 * point to last column in matrix B
463 */
464 pInB = pSrcBT + numRowsB * (numColsB - 1);
465 pInA = pSrcA->pData + i;
466
467 pSrcAVec = (q7_t const *) pInA;
468 pSrcBVec = (q7_t const *) pInB;
469
470 acc0 = 0LL;
471 blkCnt = (numColsA) >> 4;
472 while (blkCnt > 0U)
473 {
474 vecA = vld1q(pSrcAVec);
475 pSrcAVec += 16;
476 vecB = vld1q(pSrcBVec);
477 pSrcBVec += 16;
478 acc0 = vmladavaq_s8(acc0, vecA, vecB);
479
480 blkCnt--;
481 }
482 /*
483 * tail
484 * (will be merged thru tail predication)
485 */
486 blkCnt = numColsA & 0xF;
487 if (blkCnt > 0U)
488 {
489 mve_pred16_t p0 = vctp8q(blkCnt);
490 vecA = vld1q(pSrcAVec);
491 vecB = vld1q(pSrcBVec);
492 acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
493 }
494
495 *px = (q7_t) __SSAT(acc0 >> 7, 8);
496
497 px += numColsB;
498
499 i += numColsA;
500 /*
501 * Decrement the row loop counter
502 */
503 row--;
504 }
505 }
506
507 if (numRowsA & 1u)
508 {
509 col = numColsB;
510 i = 0u;
511 /*
512 * point to last row in output matrix
513 */
514 px = pDst->pData + (numColsB) * (numRowsA - 1);
515 /*
516 * col loop
517 */
518 while (col > 0)
519 {
520 q7_t const *pSrcAVec, *pSrcBVec;
521 q7x16_t vecA, vecB;
522 q63_t acc0;
523
524 /*
525 * point to last row in matrix A
526 */
527 pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
528 pInB = pSrcBT + i;
529
530 /*
531 * Set the variable sum, that acts as accumulator, to zero
532 */
533 pSrcAVec = (q7_t const *) pInA;
534 pSrcBVec = (q7_t const *) pInB;
535 acc0 = 0LL;
536
537 blkCnt = (numColsA) >> 4;
538 while (blkCnt > 0U)
539 {
540 vecA = vld1q(pSrcAVec);
541 pSrcAVec += 16;
542 vecB = vld1q(pSrcBVec);
543 pSrcBVec += 16;
544 acc0 = vmladavaq_s8(acc0, vecA, vecB);
545
546 blkCnt--;
547 }
548 /*
549 * tail
550 * (will be merged thru tail predication)
551 */
552 blkCnt = numColsA & 0xF;
553 if (blkCnt > 0U)
554 {
555 mve_pred16_t p0 = vctp8q(blkCnt);
556 vecA = vld1q(pSrcAVec);
557 vecB = vld1q(pSrcBVec);
558 acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
559 }
560
561 *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
562
563 i += numColsA;
564
565 /*
566 * Decrement the col loop counter
567 */
568 col--;
569 }
570 }
571 /*
572 * Return to application
573 */
574 status = ARM_MATH_SUCCESS;
575 }
576 return(status);
577 }
578 #else
arm_mat_mult_q7(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst,q7_t * pState)579 arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
580 {
581 q31_t sum; /* accumulator */
582 q7_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
583 q7_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
584 q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
585 q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
586 q7_t *pOut = pDst->pData; /* output data matrix pointer */
587 q7_t *px; /* Temporary output data matrix pointer */
588 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
589 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
590 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
591 uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
592 arm_status status; /* status of matrix multiplication */
593
594 (void)pState;
595
596 #ifdef ARM_MATH_MATRIX_CHECK
597
598 /* Check for matrix mismatch condition */
599 if ((pSrcA->numCols != pSrcB->numRows) ||
600 (pSrcA->numRows != pDst->numRows) ||
601 (pSrcB->numCols != pDst->numCols) )
602 {
603 /* Set status as ARM_MATH_SIZE_MISMATCH */
604 status = ARM_MATH_SIZE_MISMATCH;
605 }
606 else
607
608 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
609
610 {
611 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
612 /* row loop */
613 do {
614 /* Output pointer is set to starting address of the row being processed */
615 px = pOut + i;
616
617 /* For every row wise process, the column loop counter is to be initiated */
618 col = numColsB;
619
620 /* For every row wise process, the pIn2 pointer is set
621 ** to the starting address of the pSrcB data */
622 pIn2 = pSrcB->pData;
623
624 /* column loop */
625 do {
626 /* Set the variable sum, that acts as accumulator, to zero */
627 sum = 0;
628
629 /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
630 pIn1 = pInA;
631
632 /* Matrix A columns number of MAC operations are to be performed */
633 colCnt = numColsA;
634
635 /* matrix multiplication */
636 while (colCnt > 0U) {
637 /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
638 /* Perform the multiply-accumulates */
639 sum += (q31_t)*pIn1++ * *pIn2;
640 pIn2 += numColsB;
641
642 /* Decrement the loop counter */
643 colCnt--;
644 }
645
646 /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
647 /* Saturate and store the result in the destination buffer */
648 *px++ = (q7_t)__SSAT((sum >> 7), 8);
649
650 /* Decrement the column loop counter */
651 col--;
652
653 /* Update the pointer pIn2 to point to the starting address of the next column */
654 pIn2 = pInB + (numColsB - col);
655
656 } while (col > 0U);
657
658 /* Update the pointer pSrcA to point to the starting address of the next row */
659 i = i + numColsB;
660 pInA = pInA + numColsA;
661
662 /* Decrement the row loop counter */
663 row--;
664
665 } while (row > 0U);
666
667 /* set status as ARM_MATH_SUCCESS */
668 status = ARM_MATH_SUCCESS;
669 }
670
671 /* Return to application */
672 return (status);
673 }
674 #endif /* defined(ARM_MATH_MVEI) */
675
676 /**
677 @} end of MatrixMult group
678 */
679