• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_mult_q7.c
4  * Description:  Q7 matrix multiplication
5  *
6  * $Date:        23 April 2021
7  *
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/matrix_functions.h"
31 
32 /**
33   @ingroup groupMatrix
34  */
35 
36 /**
37   @addtogroup MatrixMult
38   @{
39  */
40 
41 /**
42  * @brief Q7 matrix multiplication
43  * @param[in]       *pSrcA points to the first input matrix structure
44  * @param[in]       *pSrcB points to the second input matrix structure
45  * @param[out]      *pDst points to output matrix structure
46  * @param[in]       *pState points to the array for storing intermediate results (holds the transpose of pSrcB in the MVE version; unused in the scalar version)
47  * @return          The function returns either
48  * <code>ARM_MATH_SIZE_MISMATCH</code> or <code>ARM_MATH_SUCCESS</code> based on the outcome of size checking.
49  *
50  * @details
51  * <b>Scaling and Overflow Behavior:</b>
52  *
53  * \par
54  * The function is implemented using a 32-bit internal accumulator saturated to 1.7 format.
55  *
56  *
57  */
58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
arm_mat_mult_q7_2x2_mve(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst)59 __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
60     const arm_matrix_instance_q7 * pSrcA,
61     const arm_matrix_instance_q7 * pSrcB,
62     arm_matrix_instance_q7 * pDst)
63 {
64     const uint32_t MATRIX_DIM = 2;
65     q7_t const *pInB = (q7_t const *)pSrcB->pData;  /* input data matrix pointer B */
66     q7_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
67     q7_t       *pOut = pDst->pData;   /* output data matrix pointer */
68     uint8x16_t vecColBOffs;
69     q7_t       *pInA0 = pInA;
70     q7_t       *pInA1 = pInA0 + MATRIX_DIM;
71     q31_t       acc0, acc1;
72     q7x16_t    vecB, vecA0, vecA1;
73     mve_pred16_t p0 = vctp8q(MATRIX_DIM);
74 
75     vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
76 
77     pInB = pSrcB->pData;
78 
79     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
80 
81     vecA0 = vldrbq_s8(pInA0);
82     vecA1 = vldrbq_s8(pInA1);
83 
84     acc0 = vmladavq_s8(vecA0, vecB);
85     acc1 = vmladavq_s8(vecA1, vecB);
86 
87     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
88     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
89     pOut++;
90 
91     /* move to next B column */
92     pInB = pInB + 1;
93 
94     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
95 
96     acc0 = vmladavq_s8(vecA0, vecB);
97     acc1 = vmladavq_s8(vecA1, vecB);
98 
99     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
100     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
101     /*
102      * Return to application
103      */
104     return (ARM_MATH_SUCCESS);
105 }
106 
107 
arm_mat_mult_q7_3x3_mve(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst)108 __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
109     const arm_matrix_instance_q7 * pSrcA,
110     const arm_matrix_instance_q7 * pSrcB,
111     arm_matrix_instance_q7 * pDst)
112 {
113     const uint8_t  MATRIX_DIM = 3;
114     q7_t const     *pInB = (q7_t const *)pSrcB->pData;  /* input data matrix pointer B */
115     q7_t           *pInA = pSrcA->pData;  /* input data matrix pointer A */
116     q7_t           *pOut = pDst->pData;   /* output data matrix pointer */
117     uint8x16_t     vecColBOffs;
118     q7_t           *pInA0 = pInA;
119     q7_t           *pInA1 = pInA0 + MATRIX_DIM;
120     q7_t           *pInA2 = pInA1 + MATRIX_DIM;
121     q31_t           acc0, acc1, acc2;
122     q7x16_t        vecB, vecA0, vecA1, vecA2;
123     mve_pred16_t    p0 = vctp8q(MATRIX_DIM);
124 
125     vecColBOffs = vidupq_u8((uint32_t)0, 1);
126     vecColBOffs = vecColBOffs * MATRIX_DIM;
127 
128     pInB = pSrcB->pData;
129 
130     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
131 
132     vecA0 = vldrbq_s8(pInA0);
133     vecA1 = vldrbq_s8(pInA1);
134     vecA2 = vldrbq_s8(pInA2);
135 
136     acc0 = vmladavq_s8(vecA0, vecB);
137     acc1 = vmladavq_s8(vecA1, vecB);
138     acc2 = vmladavq_s8(vecA2, vecB);
139 
140     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
141     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
142     pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
143     pOut++;
144 
145     /* move to next B column */
146     pInB = pInB + 1;
147 
148     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
149 
150     acc0 = vmladavq_s8(vecA0, vecB);
151     acc1 = vmladavq_s8(vecA1, vecB);
152     acc2 = vmladavq_s8(vecA2, vecB);
153 
154     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
155     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
156     pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
157     pOut++;
158 
159     /* move to next B column */
160     pInB = pInB + 1;
161 
162     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
163 
164     acc0 = vmladavq_s8(vecA0, vecB);
165     acc1 = vmladavq_s8(vecA1, vecB);
166     acc2 = vmladavq_s8(vecA2, vecB);
167 
168     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
169     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
170     pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
171     /*
172      * Return to application
173      */
174     return (ARM_MATH_SUCCESS);
175 }
176 
177 
arm_mat_mult_q7_4x4_mve(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst)178 __STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
179     const arm_matrix_instance_q7 * pSrcA,
180     const arm_matrix_instance_q7 * pSrcB,
181     arm_matrix_instance_q7 * pDst)
182 {
183     const uint32_t MATRIX_DIM = 4;
184     q7_t const *pInB = (q7_t const *)pSrcB->pData;  /* input data matrix pointer B */
185     q7_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
186     q7_t       *pOut = pDst->pData;   /* output data matrix pointer */
187     uint8x16_t vecColBOffs;
188     q7_t       *pInA0 = pInA;
189     q7_t       *pInA1 = pInA0 + MATRIX_DIM;
190     q7_t       *pInA2 = pInA1 + MATRIX_DIM;
191     q7_t       *pInA3 = pInA2 + MATRIX_DIM;
192     q31_t       acc0, acc1, acc2, acc3;
193     q7x16_t    vecB, vecA0, vecA1, vecA2, vecA3;
194     mve_pred16_t p0 = vctp8q(MATRIX_DIM);
195 
196     vecColBOffs = vidupq_u8((uint32_t)0, 4);
197 
198     pInB = pSrcB->pData;
199 
200     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
201 
202     vecA0 = vldrbq_s8(pInA0);
203     vecA1 = vldrbq_s8(pInA1);
204     vecA2 = vldrbq_s8(pInA2);
205     vecA3 = vldrbq_s8(pInA3);
206 
207     acc0 = vmladavq_s8(vecA0, vecB);
208     acc1 = vmladavq_s8(vecA1, vecB);
209     acc2 = vmladavq_s8(vecA2, vecB);
210     acc3 = vmladavq_s8(vecA3, vecB);
211 
212     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
213     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
214     pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
215     pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
216     pOut++;
217 
218     /* move to next B column */
219     pInB = pInB + 1;
220 
221     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
222 
223     acc0 = vmladavq_s8(vecA0, vecB);
224     acc1 = vmladavq_s8(vecA1, vecB);
225     acc2 = vmladavq_s8(vecA2, vecB);
226     acc3 = vmladavq_s8(vecA3, vecB);
227 
228     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
229     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
230     pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
231     pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
232     pOut++;
233 
234     /* move to next B column */
235     pInB = pInB + 1;
236 
237     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
238 
239     acc0 = vmladavq_s8(vecA0, vecB);
240     acc1 = vmladavq_s8(vecA1, vecB);
241     acc2 = vmladavq_s8(vecA2, vecB);
242     acc3 = vmladavq_s8(vecA3, vecB);
243 
244     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
245     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
246     pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
247     pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
248     pOut++;
249 
250     /* move to next B column */
251     pInB = pInB + 1;
252 
253     vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
254 
255     acc0 = vmladavq_s8(vecA0, vecB);
256     acc1 = vmladavq_s8(vecA1, vecB);
257     acc2 = vmladavq_s8(vecA2, vecB);
258     acc3 = vmladavq_s8(vecA3, vecB);
259 
260     pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
261     pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
262     pOut[2 * MATRIX_DIM] = (q7_t) __SSAT(acc2 >> 7, 8);
263     pOut[3 * MATRIX_DIM] = (q7_t) __SSAT(acc3 >> 7, 8);
264     /*
265      * Return to application
266      */
267     return (ARM_MATH_SUCCESS);
268 }
269 
arm_mat_mult_q7(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst,q7_t * pState)270 arm_status arm_mat_mult_q7(
271     const arm_matrix_instance_q7 * pSrcA,
272     const arm_matrix_instance_q7 * pSrcB,
273     arm_matrix_instance_q7 * pDst,
274     q7_t * pState)
275 {
276     q7_t    *pInA = pSrcA->pData;  /* input data matrix pointer A of Q7 type */
277     q7_t    *pInB = pSrcB->pData;  /* input data matrix pointer B of Q7 type */
278     q7_t    *pInA2;
279     q7_t    *pInB2;
280     q7_t    *px;               /* Temporary output data matrix pointer */
281     q7_t    *px2;              /* Temporary output data matrix pointer */
282     uint32_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
283     uint32_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
284     uint32_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
285     uint32_t  numRowsB = pSrcB->numRows;    /* number of rows of input matrix A    */
286     uint32_t  col, i = 0u, j, row = numRowsB;   /* loop counters */
287     q7_t    *pSrcBT = pState;   /* input data matrix pointer for transpose */
288     uint32_t  blkCnt;           /* loop counters */
289     arm_status status;                            /* status of matrix multiplication */
290     arm_matrix_instance_q7 BT;
291 
292 
293    #ifdef ARM_MATH_MATRIX_CHECK
294 
295   /* Check for matrix mismatch condition */
296   if ((pSrcA->numCols != pSrcB->numRows) ||
297       (pSrcA->numRows != pDst->numRows)  ||
298       (pSrcB->numCols != pDst->numCols)    )
299   {
300     /* Set status as ARM_MATH_SIZE_MISMATCH */
301     status = ARM_MATH_SIZE_MISMATCH;
302   }
303   else
304 
305 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
306   {
307     /* small squared matrix specialized routines */
308     if(numRowsA == numColsB && numColsB == numColsA) {
309         if(numRowsA == 2)
310             return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
311         else if(numRowsA == 3)
312             return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
313         else if (numRowsA == 4)
314             return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
315     }
316     /*
317      * Matrix transpose
318      */
319 
320     BT.numRows = numColsB;
321     BT.numCols = numRowsB;
322     BT.pData = pSrcBT;
323 
324     arm_mat_trans_q7(pSrcB, &BT);
325 
326     /*
327      * Reset the variables for the usage in the following multiplication process
328      */
329     i = 0;
330     row = numRowsA >> 1;
331     px = pDst->pData;
332     px2 = px + numColsB;
333 
334     /*
335      * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
336      */
337 
338     /*
339      * row loop
340      */
341     while (row > 0u)
342     {
343         /*
344          * For every row wise process, the column loop counter is to be initiated
345          */
346         col = numColsB >> 1;
347         /*
348          * For every row wise process, the pIn2 pointer is set
349          * to the starting address of the transposed pSrcB data
350          */
351         pInB = pSrcBT;
352         pInB2 = pInB + numRowsB;
353         j = 0;
354 
355         /*
356          * column loop
357          */
358         while (col > 0u)
359         {
360             q7_t const     *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
361             q7x16_t        vecA, vecA2, vecB, vecB2;
362             q31_t           acc0, acc1, acc2, acc3;
363 
364             /*
365              * Initiate the pointer pIn1 to point to the starting address of the column being processed
366              */
367             pInA = pSrcA->pData + i;
368             pInA2 = pInA + numColsA;
369             pInB = pSrcBT + j;
370             pInB2 = pInB + numRowsB;
371 
372             pSrcAVec = (q7_t const *) pInA;
373             pSrcA2Vec = (q7_t const *)pInA2;
374             pSrcBVec = (q7_t const *) pInB;
375             pSrcB2Vec = (q7_t const *)pInB2;
376 
377             acc0 = 0L;
378             acc1 = 0L;
379             acc2 = 0L;
380             acc3 = 0L;
381 
382             vecA = vld1q(pSrcAVec);
383             pSrcAVec += 16;
384 
385             blkCnt = numColsA >> 4;
386             while (blkCnt > 0U)
387             {
388                 vecB = vld1q(pSrcBVec);
389                 pSrcBVec += 16;
390                 acc0 = vmladavaq_s8(acc0, vecA, vecB);
391                 vecA2 = vld1q(pSrcA2Vec);
392                 pSrcA2Vec += 16;
393                 acc1 = vmladavaq_s8(acc1, vecA2, vecB);
394                 vecB2 = vld1q(pSrcB2Vec);
395                 pSrcB2Vec += 16;
396                 acc2 = vmladavaq_s8(acc2, vecA, vecB2);
397                 vecA = vld1q(pSrcAVec);
398                 pSrcAVec += 16;
399                 acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
400 
401                 blkCnt--;
402             }
403             /*
404              * tail
405              * (will be merged thru tail predication)
406              */
407             blkCnt = numColsA & 0xF;
408             if (blkCnt > 0U)
409             {
410                 mve_pred16_t p0 = vctp8q(blkCnt);
411                 vecB = vld1q(pSrcBVec);
412                 acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
413                 vecA2 = vld1q(pSrcA2Vec);
414                 acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
415                 vecB2 = vld1q(pSrcB2Vec);
416                 acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
417                 vecA = vld1q(pSrcAVec);
418                 acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
419             }
420 
421             *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
422             *px++ = (q7_t) __SSAT(acc2 >> 7, 8);
423             *px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
424             *px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
425             j += numRowsB * 2;
426             /*
427              * Decrement the column loop counter
428              */
429             col--;
430 
431         }
432 
433         i = i + numColsA * 2;
434         px = px2 + (numColsB & 1u);
435         px2 = px + numColsB;
436         /*
437          * Decrement the row loop counter
438          */
439         row--;
440     }
441 
442     /*
443      * Compute remaining row and/or column below
444      */
445 
446     if (numColsB & 1u)
447     {
448         row = numRowsA & (~0x1);    //avoid redundant computation
449         px = pDst->pData + numColsB - 1;
450         i = 0;
451 
452         /*
453          * row loop
454          */
455         while (row > 0)
456         {
457             q7_t const   *pSrcAVec, *pSrcBVec;
458             q7x16_t       vecA, vecB;
459             q63_t           acc0;
460 
461             /*
462              * point to last column in matrix B
463              */
464             pInB = pSrcBT + numRowsB * (numColsB - 1);
465             pInA = pSrcA->pData + i;
466 
467             pSrcAVec = (q7_t const *) pInA;
468             pSrcBVec = (q7_t const *) pInB;
469 
470             acc0 = 0LL;
471             blkCnt = (numColsA) >> 4;
472             while (blkCnt > 0U)
473             {
474                 vecA = vld1q(pSrcAVec);
475                 pSrcAVec += 16;
476                 vecB = vld1q(pSrcBVec);
477                 pSrcBVec += 16;
478                 acc0 = vmladavaq_s8(acc0, vecA, vecB);
479 
480                 blkCnt--;
481             }
482             /*
483              * tail
484              * (will be merged thru tail predication)
485              */
486             blkCnt = numColsA & 0xF;
487             if (blkCnt > 0U)
488             {
489                 mve_pred16_t p0 = vctp8q(blkCnt);
490                 vecA = vld1q(pSrcAVec);
491                 vecB = vld1q(pSrcBVec);
492                 acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
493             }
494 
495             *px = (q7_t) __SSAT(acc0 >> 7, 8);
496 
497             px += numColsB;
498 
499             i += numColsA;
500             /*
501              * Decrement the row loop counter
502              */
503             row--;
504         }
505     }
506 
507     if (numRowsA & 1u)
508     {
509         col = numColsB;
510         i = 0u;
511         /*
512          * point to last row in output matrix
513          */
514         px = pDst->pData + (numColsB) * (numRowsA - 1);
515         /*
516          * col loop
517          */
518         while (col > 0)
519         {
520             q7_t const    *pSrcAVec, *pSrcBVec;
521             q7x16_t       vecA, vecB;
522             q63_t           acc0;
523 
524             /*
525              * point to last row in matrix A
526              */
527             pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
528             pInB = pSrcBT + i;
529 
530             /*
531              * Set the variable sum, that acts as accumulator, to zero
532              */
533             pSrcAVec = (q7_t const *) pInA;
534             pSrcBVec = (q7_t const *) pInB;
535             acc0 = 0LL;
536 
537             blkCnt = (numColsA) >> 4;
538             while (blkCnt > 0U)
539             {
540                 vecA = vld1q(pSrcAVec);
541                 pSrcAVec += 16;
542                 vecB = vld1q(pSrcBVec);
543                 pSrcBVec += 16;
544                 acc0 = vmladavaq_s8(acc0, vecA, vecB);
545 
546                 blkCnt--;
547             }
548             /*
549              * tail
550              * (will be merged thru tail predication)
551              */
552             blkCnt = numColsA & 0xF;
553             if (blkCnt > 0U)
554             {
555                 mve_pred16_t p0 = vctp8q(blkCnt);
556                 vecA = vld1q(pSrcAVec);
557                 vecB = vld1q(pSrcBVec);
558                 acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
559             }
560 
561             *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
562 
563             i += numColsA;
564 
565             /*
566              * Decrement the col loop counter
567              */
568             col--;
569         }
570     }
571     /*
572      * Return to application
573      */
574      status = ARM_MATH_SUCCESS;
575     }
576     return(status);
577 }
578 #else
arm_mat_mult_q7(const arm_matrix_instance_q7 * pSrcA,const arm_matrix_instance_q7 * pSrcB,arm_matrix_instance_q7 * pDst,q7_t * pState)579 arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
580 {
581     q31_t sum; /* accumulator */
582     q7_t *pIn1 = pSrcA->pData;                    /* input data matrix pointer A */
583     q7_t *pIn2 = pSrcB->pData;                    /* input data matrix pointer B */
584     q7_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q7 type */
585     q7_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q7 type */
586     q7_t *pOut = pDst->pData;                     /* output data matrix pointer */
587     q7_t *px;                                     /* Temporary output data matrix pointer */
588     uint16_t numColsB = pSrcB->numCols;           /* number of columns of input matrix B */
589     uint16_t numColsA = pSrcA->numCols;           /* number of columns of input matrix A */
590     uint16_t numRowsA = pSrcA->numRows;           /* number of rows of input matrix A    */
591     uint16_t col, i = 0U, row = numRowsA, colCnt; /* loop counters */
592     arm_status status;                            /* status of matrix multiplication */
593 
594     (void)pState;
595 
596 #ifdef ARM_MATH_MATRIX_CHECK
597 
598   /* Check for matrix mismatch condition */
599   if ((pSrcA->numCols != pSrcB->numRows) ||
600       (pSrcA->numRows != pDst->numRows)  ||
601       (pSrcB->numCols != pDst->numCols)    )
602   {
603     /* Set status as ARM_MATH_SIZE_MISMATCH */
604     status = ARM_MATH_SIZE_MISMATCH;
605   }
606   else
607 
608 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
609 
610     {
611         /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
612         /* row loop */
613         do {
614             /* Output pointer is set to starting address of the row being processed */
615             px = pOut + i;
616 
617             /* For every row wise process, the column loop counter is to be initiated */
618             col = numColsB;
619 
620             /* For every row wise process, the pIn2 pointer is set
621              ** to the starting address of the pSrcB data */
622             pIn2 = pSrcB->pData;
623 
624             /* column loop */
625             do {
626                 /* Set the variable sum, that acts as accumulator, to zero */
627                 sum = 0;
628 
629                 /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
630                 pIn1 = pInA;
631 
632                 /* Matrix A columns number of MAC operations are to be performed */
633                 colCnt = numColsA;
634 
635                 /* matrix multiplication */
636                 while (colCnt > 0U) {
637                     /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
638                     /* Perform the multiply-accumulates */
639                     sum += (q31_t)*pIn1++ * *pIn2;
640                     pIn2 += numColsB;
641 
642                     /* Decrement the loop counter */
643                     colCnt--;
644                 }
645 
646                 /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
647                 /* Saturate and store the result in the destination buffer */
648                 *px++ = (q7_t)__SSAT((sum >> 7), 8);
649 
650                 /* Decrement the column loop counter */
651                 col--;
652 
653                 /* Update the pointer pIn2 to point to the  starting address of the next column */
654                 pIn2 = pInB + (numColsB - col);
655 
656             } while (col > 0U);
657 
658             /* Update the pointer pSrcA to point to the  starting address of the next row */
659             i = i + numColsB;
660             pInA = pInA + numColsA;
661 
662             /* Decrement the row loop counter */
663             row--;
664 
665         } while (row > 0U);
666 
667         /* set status as ARM_MATH_SUCCESS */
668         status = ARM_MATH_SUCCESS;
669     }
670 
671     /* Return to application */
672     return (status);
673 }
674 #endif /* defined(ARM_MATH_MVEI) */
675 
676 /**
677   @} end of MatrixMult group
678  */
679