• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_mat_mult_q15.c
4  * Description:  Q15 matrix multiplication
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/matrix_functions.h"
30 
31 /**
32   @ingroup groupMatrix
33  */
34 
35 /**
36   @addtogroup MatrixMult
37   @{
38  */
39 
40 /**
41   @brief         Q15 matrix multiplication.
42   @param[in]     pSrcA      points to the first input matrix structure
43   @param[in]     pSrcB      points to the second input matrix structure
44   @param[out]    pDst       points to output matrix structure
45   @param[in]     pState     points to the array for storing intermediate results (transpose scratch buffer in the ARM_MATH_DSP build; unused in the MVE and plain C builds)
46   @return        execution status
47                    - \ref ARM_MATH_SUCCESS       : Operation successful
48                    - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
49 
50   @par           Scaling and Overflow Behavior
51                    The function is implemented using an internal 64-bit accumulator. The inputs to the
52                    multiplications are in 1.15 format and multiplications yield a 2.30 result.
53                    The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
54                    This approach provides 33 guard bits and there is no risk of overflow.
55                    The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
56                    and then saturated to 1.15 format.
57   @par
58                    Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
59  */
60 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
61 
62 #define MVE_ASRL_SAT16(acc, shift)          ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
63 
64 #define MATRIX_DIM2 2
65 #define MATRIX_DIM3 3
66 #define MATRIX_DIM4 4
67 
arm_mat_mult_q15_2x2_mve(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst)68 __STATIC_INLINE arm_status arm_mat_mult_q15_2x2_mve(
69     const arm_matrix_instance_q15 * pSrcA,
70     const arm_matrix_instance_q15 * pSrcB,
71     arm_matrix_instance_q15 * pDst)
72 {
73     q15_t       *pInB = pSrcB->pData;  /* input data matrix pointer B */
74     q15_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
75     q15_t       *pOut = pDst->pData;   /* output data matrix pointer */
76     uint16x8_t  vecColBOffs;
77     q15_t       *pInA0 = pInA;
78     q15_t       *pInA1 = pInA0 + MATRIX_DIM2;
79     q63_t        acc0, acc1;
80     q15x8_t     vecB, vecA0, vecA1;
81     mve_pred16_t p0 = vctp16q(MATRIX_DIM2);
82 
83     vecColBOffs = vidupq_u16((uint32_t)0, 2); /* MATRIX_DIM2 */
84 
85     pInB = pSrcB->pData;
86 
87     vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
88 
89     vecA0 = vldrhq_s16(pInA0);
90     vecA1 = vldrhq_s16(pInA1);
91 
92     acc0 = vmlaldavq(vecA0, vecB);
93     acc1 = vmlaldavq(vecA1, vecB);
94 
95     acc0 = asrl(acc0, 15);
96     acc1 = asrl(acc1, 15);
97 
98     pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
99     pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
100     pOut++;
101 
102     /* move to next B column */
103     pInB = pInB + 1;
104 
105     vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
106 
107     acc0 = vmlaldavq(vecA0, vecB);
108     acc1 = vmlaldavq(vecA1, vecB);
109 
110     acc0 = asrl(acc0, 15);
111     acc1 = asrl(acc1, 15);
112 
113     pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
114     pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
115 
116     /*
117      * Return to application
118      */
119     return (ARM_MATH_SUCCESS);
120 }
121 
122 
123 
arm_mat_mult_q15_3x3_mve(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst)124 __STATIC_INLINE arm_status arm_mat_mult_q15_3x3_mve(
125     const arm_matrix_instance_q15 * pSrcA,
126     const arm_matrix_instance_q15 * pSrcB,
127     arm_matrix_instance_q15 * pDst)
128 {
129     q15_t       *pInB = pSrcB->pData;  /* input data matrix pointer B */
130     q15_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
131     q15_t       *pOut = pDst->pData;   /* output data matrix pointer */
132     uint16x8_t vecColBOffs;
133     q15_t       *pInA0 = pInA;
134     q15_t       *pInA1 = pInA0 + MATRIX_DIM3;
135     q15_t       *pInA2 = pInA1 + MATRIX_DIM3;
136     q63_t        acc0, acc1, acc2;
137     q15x8_t    vecB, vecA0, vecA1, vecA2;
138     mve_pred16_t p0 = vctp16q(MATRIX_DIM3);
139 
140     vecColBOffs = vidupq_u16((uint32_t)0, 1);
141     vecColBOffs = vecColBOffs * MATRIX_DIM3;
142 
143     pInB = pSrcB->pData;
144 
145     vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
146 
147     vecA0 = vldrhq_s16(pInA0);
148     vecA1 = vldrhq_s16(pInA1);
149     vecA2 = vldrhq_s16(pInA2);
150 
151     acc0 = vmlaldavq(vecA0, vecB);
152     acc1 = vmlaldavq(vecA1, vecB);
153     acc2 = vmlaldavq(vecA2, vecB);
154 
155     acc0 = asrl(acc0, 15);
156     acc1 = asrl(acc1, 15);
157     acc2 = asrl(acc2, 15);
158 
159     pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
160     pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
161     pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
162     pOut++;
163 
164     /* move to next B column */
165     pInB = pInB + 1;
166 
167     vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
168 
169     acc0 = vmlaldavq(vecA0, vecB);
170     acc1 = vmlaldavq(vecA1, vecB);
171     acc2 = vmlaldavq(vecA2, vecB);
172 
173     acc0 = asrl(acc0, 15);
174     acc1 = asrl(acc1, 15);
175     acc2 = asrl(acc2, 15);
176 
177     pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
178     pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
179     pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
180     pOut++;
181 
182     /* move to next B column */
183     pInB = pInB + 1;
184 
185     vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
186 
187     acc0 = vmlaldavq(vecA0, vecB);
188     acc1 = vmlaldavq(vecA1, vecB);
189     acc2 = vmlaldavq(vecA2, vecB);
190 
191     acc0 = asrl(acc0, 15);
192     acc1 = asrl(acc1, 15);
193     acc2 = asrl(acc2, 15);
194 
195     pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
196     pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
197     pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
198     /*
199      * Return to application
200      */
201     return (ARM_MATH_SUCCESS);
202 }
203 
204 
arm_mat_mult_q15_4x4_mve(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst)205 __STATIC_INLINE arm_status arm_mat_mult_q15_4x4_mve(
206     const arm_matrix_instance_q15 * pSrcA,
207     const arm_matrix_instance_q15 * pSrcB,
208     arm_matrix_instance_q15 * pDst)
209 {
210     q15_t       *pInB = pSrcB->pData;  /* input data matrix pointer B */
211     q15_t       *pInA = pSrcA->pData;  /* input data matrix pointer A */
212     q15_t       *pOut = pDst->pData;   /* output data matrix pointer */
213     uint16x8_t vecColBOffs;
214     q15_t       *pInA0 = pInA;
215     q15_t       *pInA1 = pInA0 + MATRIX_DIM4;
216     q15_t       *pInA2 = pInA1 + MATRIX_DIM4;
217     q15_t       *pInA3 = pInA2 + MATRIX_DIM4;
218     q63_t        acc0, acc1, acc2, acc3;
219     q15x8_t     vecB, vecA0, vecA1, vecA2, vecA3;
220     mve_pred16_t p0 = vctp16q(MATRIX_DIM4);
221 
222     vecColBOffs = vidupq_u16((uint32_t)0, 4);
223 
224     pInB = pSrcB->pData;
225 
226     vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
227 
228     vecA0 = vldrhq_s16(pInA0);
229     vecA1 = vldrhq_s16(pInA1);
230     vecA2 = vldrhq_s16(pInA2);
231     vecA3 = vldrhq_s16(pInA3);
232 
233     acc0 = vmlaldavq(vecA0, vecB);
234     acc1 = vmlaldavq(vecA1, vecB);
235     acc2 = vmlaldavq(vecA2, vecB);
236     acc3 = vmlaldavq(vecA3, vecB);
237 
238     acc0 = asrl(acc0, 15);
239     acc1 = asrl(acc1, 15);
240     acc2 = asrl(acc2, 15);
241     acc3 = asrl(acc3, 15);
242 
243     pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
244     pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
245     pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
246     pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
247     pOut++;
248 
249     /* move to next B column */
250     pInB = pInB + 1;
251 
252     vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
253 
254     acc0 = vmlaldavq(vecA0, vecB);
255     acc1 = vmlaldavq(vecA1, vecB);
256     acc2 = vmlaldavq(vecA2, vecB);
257     acc3 = vmlaldavq(vecA3, vecB);
258 
259     acc0 = asrl(acc0, 15);
260     acc1 = asrl(acc1, 15);
261     acc2 = asrl(acc2, 15);
262     acc3 = asrl(acc3, 15);
263 
264     pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
265     pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
266     pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
267     pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
268 
269     pOut++;
270 
271     /* move to next B column */
272     pInB = pInB + 1;
273 
274     vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
275 
276     acc0 = vmlaldavq(vecA0, vecB);
277     acc1 = vmlaldavq(vecA1, vecB);
278     acc2 = vmlaldavq(vecA2, vecB);
279     acc3 = vmlaldavq(vecA3, vecB);
280 
281     acc0 = asrl(acc0, 15);
282     acc1 = asrl(acc1, 15);
283     acc2 = asrl(acc2, 15);
284     acc3 = asrl(acc3, 15);
285 
286     pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
287     pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
288     pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
289     pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
290 
291     pOut++;
292 
293     /* move to next B column */
294     pInB = pInB + 1;
295 
296     vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
297 
298     acc0 = vmlaldavq(vecA0, vecB);
299     acc1 = vmlaldavq(vecA1, vecB);
300     acc2 = vmlaldavq(vecA2, vecB);
301     acc3 = vmlaldavq(vecA3, vecB);
302 
303     acc0 = asrl(acc0, 15);
304     acc1 = asrl(acc1, 15);
305     acc2 = asrl(acc2, 15);
306     acc3 = asrl(acc3, 15);
307 
308     pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
309     pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
310     pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
311     pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
312     /*
313      * Return to application
314      */
315     return (ARM_MATH_SUCCESS);
316 }
317 
arm_mat_mult_q15(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst,q15_t * pState)318 arm_status arm_mat_mult_q15(
319   const arm_matrix_instance_q15 * pSrcA,
320   const arm_matrix_instance_q15 * pSrcB,
321         arm_matrix_instance_q15 * pDst,
322         q15_t                   * pState)
323 {
324     q15_t    *pInB = pSrcB->pData;  /* input data matrix pointer B */
325     q15_t    *pInA = pSrcA->pData;  /* input data matrix pointer A */
326     q15_t    *pOut = pDst->pData;   /* output data matrix pointer */
327     q15_t    *px;               /* Temporary output data matrix pointer */
328     uint16_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A    */
329     uint16_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
330     uint16_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
331     uint16_t  col, i = 0U, row = numRowsA;  /* loop counters */
332     uint16x8_t vecOffs, vecColBOffs;
333     uint32_t  blkCnt,rowCnt;           /* loop counters */
334     arm_status status;                             /* Status of matrix multiplication */
335     (void)pState;
336 
337 #ifdef ARM_MATH_MATRIX_CHECK
338 
339   /* Check for matrix mismatch condition */
340   if ((pSrcA->numCols != pSrcB->numRows) ||
341       (pSrcA->numRows != pDst->numRows)  ||
342       (pSrcB->numCols != pDst->numCols)    )
343   {
344     /* Set status as ARM_MATH_SIZE_MISMATCH */
345     status = ARM_MATH_SIZE_MISMATCH;
346   }
347   else
348 #endif
349   {
350     /* small squared matrix specialized routines */
351     if(numRowsA == numColsB && numColsB == numColsA) {
352 
353         if (numRowsA == 1)
354         {
355            q63_t sum;
356            sum = pInA[0] * pInB[0];
357            pOut[0] = (q15_t) __SSAT((sum >> 15), 16);
358            return (ARM_MATH_SUCCESS);
359         }
360         else if(numRowsA == 2)
361             return arm_mat_mult_q15_2x2_mve(pSrcA, pSrcB, pDst);
362         else if(numRowsA == 3)
363             return arm_mat_mult_q15_3x3_mve(pSrcA, pSrcB, pDst);
364         else if (numRowsA == 4)
365             return arm_mat_mult_q15_4x4_mve(pSrcA, pSrcB, pDst);
366     }
367 
368     vecColBOffs = vidupq_u16((uint32_t)0, 1);
369     vecColBOffs = vecColBOffs * (uint16_t) (numColsB);
370 
371     /*
372      * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
373      */
374 
375     /*
376      * row loop
377      */
378     rowCnt = row >> 2;
379     while (rowCnt > 0U)
380     {
381         /*
382          * Output pointer is set to starting address of the row being processed
383          */
384         px = pOut + i;
385         i = i + 4 * numColsB;
386         /*
387          * For every row wise process, the column loop counter is to be initiated
388          */
389         col = numColsB;
390         /*
391          * For every row wise process, the pInB pointer is set
392          * to the starting address of the pSrcB data
393          */
394         pInB = pSrcB->pData;
395         /*
396          * column loop
397          */
398         while (col > 0U)
399         {
400             /*
401              * generate 4 columns elements
402              */
403             /*
404              * Matrix A columns number of MAC operations are to be performed
405              */
406 
407             q15_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
408             q15_t    *pInA0 = pInA;
409             q15_t    *pInA1 = pInA0 + numColsA;
410             q15_t    *pInA2 = pInA1 + numColsA;
411             q15_t    *pInA3 = pInA2 + numColsA;
412             q63_t     acc0, acc1, acc2, acc3;
413 
414             acc0 = 0LL;
415             acc1 = 0LL;
416             acc2 = 0LL;
417             acc3 = 0LL;
418 
419             pSrcA0Vec = (q15_t const *) pInA0;
420             pSrcA1Vec = (q15_t const *) pInA1;
421             pSrcA2Vec = (q15_t const *) pInA2;
422             pSrcA3Vec = (q15_t const *) pInA3;
423 
424             vecOffs = vecColBOffs;
425 
426             blkCnt = (numColsA) >> 3;
427             while (blkCnt > 0U)
428             {
429                 q15x8_t vecB, vecA;
430 
431                 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
432                 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
433 
434                 vecA = vld1q(pSrcA0Vec);  pSrcA0Vec += 8;
435                 acc0 = vmlaldavaq(acc0, vecA, vecB);
436                 vecA = vld1q(pSrcA1Vec);  pSrcA1Vec += 8;
437                 acc1 = vmlaldavaq(acc1, vecA, vecB);
438                 vecA = vld1q(pSrcA2Vec);  pSrcA2Vec += 8;
439                 acc2 = vmlaldavaq(acc2, vecA, vecB);
440                 vecA = vld1q(pSrcA3Vec);  pSrcA3Vec += 8;
441                 acc3 = vmlaldavaq(acc3, vecA, vecB);
442                 blkCnt--;
443 
444             }
445             /*
446              * tail
447              */
448             blkCnt = numColsA & 7;
449             if (blkCnt > 0U)
450             {
451                 mve_pred16_t p0 = vctp16q(blkCnt);
452                 q15x8_t   vecB, vecA;
453 
454                 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
455                 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
456 
457                 vecA = vld1q(pSrcA0Vec);
458                 acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
459                 vecA = vld1q(pSrcA1Vec);
460                 acc1 = vmlaldavaq_p(acc1, vecA, vecB, p0);
461                 vecA = vld1q(pSrcA2Vec);
462                 acc2 = vmlaldavaq_p(acc2, vecA, vecB, p0);
463                 vecA = vld1q(pSrcA3Vec);
464                 acc3 = vmlaldavaq_p(acc3, vecA, vecB, p0);
465             }
466 
467             px[0]            = (q15_t)MVE_ASRL_SAT16(acc0, 15);
468             px[1 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc1, 15);
469             px[2 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc2, 15);
470             px[3 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc3, 15);
471             px++;
472             /*
473              * Decrement the column loop counter
474              */
475             col--;
476             /*
477              * Update the pointer pInB to point to the  starting address of the next column
478              */
479             pInB = pSrcB->pData + (numColsB - col);
480         }
481 
482         /*
483          * Update the pointer pInA to point to the  starting address of the next row
484          */
485         pInA += (numColsA * 4);
486         /*
487          * Decrement the row loop counter
488          */
489         rowCnt --;
490 
491     }
492 
493     rowCnt = row & 3;
494     while (rowCnt > 0U)
495     {
496       /*
497          * Output pointer is set to starting address of the row being processed
498          */
499         px = pOut + i;
500         i = i + numColsB;
501         /*
502          * For every row wise process, the column loop counter is to be initiated
503          */
504         col = numColsB;
505         /*
506          * For every row wise process, the pInB pointer is set
507          * to the starting address of the pSrcB data
508          */
509         pInB = pSrcB->pData;
510         /*
511          * column loop
512          */
513         while (col > 0U)
514         {
515             /*
516              * generate 4 columns elements
517              */
518             /*
519              * Matrix A columns number of MAC operations are to be performed
520              */
521 
522             q15_t const *pSrcA0Vec;
523             q15_t    *pInA0 = pInA;
524             q63_t     acc0;
525 
526             acc0 = 0LL;
527 
528             pSrcA0Vec = (q15_t const *) pInA0;
529 
530             vecOffs = vecColBOffs;
531 
532             blkCnt = (numColsA) >> 3;
533             while (blkCnt > 0U)
534             {
535                 q15x8_t vecB, vecA;
536 
537                 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
538                 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
539 
540                 vecA = vld1q(pSrcA0Vec);
541                 pSrcA0Vec += 8;
542                 acc0 = vmlaldavaq(acc0, vecA, vecB);
543 
544                 blkCnt--;
545 
546             }
547             /*
548              * tail
549              */
550             blkCnt = numColsA & 7;
551             if (blkCnt > 0U)
552             {
553                 mve_pred16_t p0 = vctp16q(blkCnt);
554                 q15x8_t   vecB, vecA;
555 
556                 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
557                 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
558 
559                 vecA = vld1q(pSrcA0Vec);
560                 acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
561 
562             }
563 
564             px[0]            = (q15_t)MVE_ASRL_SAT16(acc0, 15);
565 
566             px++;
567             /*
568              * Decrement the column loop counter
569              */
570             col--;
571             /*
572              * Update the pointer pInB to point to the  starting address of the next column
573              */
574             pInB = pSrcB->pData + (numColsB - col);
575         }
576 
577         /*
578          * Update the pointer pInA to point to the  starting address of the next row
579          */
580         pInA += (numColsA );
581         rowCnt--;
582     }
583     /* Set status as ARM_MATH_SUCCESS */
584     status = ARM_MATH_SUCCESS;
585   }
586 
587   /* Return to application */
588   return (status);
589 
590 }
591 #else
arm_mat_mult_q15(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst,q15_t * pState)592 arm_status arm_mat_mult_q15(
593   const arm_matrix_instance_q15 * pSrcA,
594   const arm_matrix_instance_q15 * pSrcB,
595         arm_matrix_instance_q15 * pDst,
596         q15_t                   * pState)
597 {
598         q63_t sum;                                     /* Accumulator */
599 
600 #if defined (ARM_MATH_DSP)                             /* != CM0 */
601 
602         q15_t *pSrcBT = pState;                        /* Input data matrix pointer for transpose */
603         q15_t *pInA = pSrcA->pData;                    /* Input data matrix pointer A of Q15 type */
604         q15_t *pInB = pSrcB->pData;                    /* Input data matrix pointer B of Q15 type */
605         q15_t *px;                                     /* Temporary output data matrix pointer */
606         uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
607         uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
608         uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
609         uint16_t numRowsB = pSrcB->numRows;            /* Number of rows of input matrix B */
610         uint32_t col, i = 0U, row = numRowsB, colCnt;  /* Loop counters */
611         arm_status status;                             /* Status of matrix multiplication */
612 
613         q31_t in;                                      /* Temporary variable to hold the input value */
614         q31_t inA1, inB1, inA2, inB2;
615 
616 #ifdef ARM_MATH_MATRIX_CHECK
617 
618   /* Check for matrix mismatch condition */
619   if ((pSrcA->numCols != pSrcB->numRows) ||
620       (pSrcA->numRows != pDst->numRows)  ||
621       (pSrcB->numCols != pDst->numCols)    )
622   {
623     /* Set status as ARM_MATH_SIZE_MISMATCH */
624     status = ARM_MATH_SIZE_MISMATCH;
625   }
626   else
627 
628 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
629 
630   {
631     /* Matrix transpose */
632     do
633     {
634       /* The pointer px is set to starting address of column being processed */
635       px = pSrcBT + i;
636 
637       /* Apply loop unrolling and exchange columns with row elements */
638       col = numColsB >> 2U;
639 
640       /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
641        ** a second loop below computes the remaining 1 to 3 samples. */
642       while (col > 0U)
643       {
644         /* Read two elements from row */
645         in = read_q15x2_ia ((q15_t **) &pInB);
646 
647         /* Unpack and store one element in destination */
648 #ifndef ARM_MATH_BIG_ENDIAN
649         *px = (q15_t) in;
650 #else
651         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
652 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
653 
654         /* Update pointer px to point to next row of transposed matrix */
655         px += numRowsB;
656 
657         /* Unpack and store second element in destination */
658 #ifndef ARM_MATH_BIG_ENDIAN
659         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
660 #else
661         *px = (q15_t) in;
662 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
663 
664         /* Update pointer px to point to next row of transposed matrix */
665         px += numRowsB;
666 
667         /* Read two elements from row */
668         in = read_q15x2_ia ((q15_t **) &pInB);
669 
670         /* Unpack and store one element in destination */
671 #ifndef ARM_MATH_BIG_ENDIAN
672         *px = (q15_t) in;
673 #else
674         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
675 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
676         px += numRowsB;
677 
678 #ifndef ARM_MATH_BIG_ENDIAN
679         *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
680 #else
681         *px = (q15_t) in;
682 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
683         px += numRowsB;
684 
685         /* Decrement column loop counter */
686         col--;
687       }
688 
689       /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
690        ** No loop unrolling is used. */
691       col = numColsB % 0x4U;
692 
693       while (col > 0U)
694       {
695         /* Read and store input element in destination */
696         *px = *pInB++;
697 
698         /* Update pointer px to point to next row of transposed matrix */
699         px += numRowsB;
700 
701         /* Decrement column loop counter */
702         col--;
703       }
704 
705       i++;
706 
707       /* Decrement row loop counter */
708       row--;
709 
710     } while (row > 0U);
711 
712     /* Reset variables for usage in following multiplication process */
713     row = numRowsA;
714     i = 0U;
715     px = pDst->pData;
716 
717     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
718     /* row loop */
719     do
720     {
721       /* For every row wise process, column loop counter is to be initiated */
722       col = numColsB;
723 
724       /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
725       pInB = pSrcBT;
726 
727       /* column loop */
728       do
729       {
730         /* Set variable sum, that acts as accumulator, to zero */
731         sum = 0;
732 
733         /* Initiate pointer pInA to point to starting address of column being processed */
734         pInA = pSrcA->pData + i;
735 
736         /* Apply loop unrolling and compute 2 MACs simultaneously. */
737         colCnt = numColsA >> 2U;
738 
739         /* matrix multiplication */
740         while (colCnt > 0U)
741         {
742           /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
743 
744           /* read real and imag values from pSrcA and pSrcB buffer */
745           inA1 = read_q15x2_ia ((q15_t **) &pInA);
746           inB1 = read_q15x2_ia ((q15_t **) &pInB);
747 
748           inA2 = read_q15x2_ia ((q15_t **) &pInA);
749           inB2 = read_q15x2_ia ((q15_t **) &pInB);
750 
751           /* Multiply and Accumulates */
752           sum = __SMLALD(inA1, inB1, sum);
753           sum = __SMLALD(inA2, inB2, sum);
754 
755           /* Decrement loop counter */
756           colCnt--;
757         }
758 
759         /* process remaining column samples */
760         colCnt = numColsA % 0x4U;
761 
762         while (colCnt > 0U)
763         {
764           /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
765           sum += *pInA++ * *pInB++;
766 
767           /* Decrement loop counter */
768           colCnt--;
769         }
770 
771         /* Saturate and store result in destination buffer */
772         *px = (q15_t) (__SSAT((sum >> 15), 16));
773         px++;
774 
775         /* Decrement column loop counter */
776         col--;
777 
778       } while (col > 0U);
779 
780       i = i + numColsA;
781 
782       /* Decrement row loop counter */
783       row--;
784 
785     } while (row > 0U);
786 
787 #else /* #if defined (ARM_MATH_DSP) */
788 
789         q15_t *pIn1 = pSrcA->pData;                    /* Input data matrix pointer A */
790         q15_t *pIn2 = pSrcB->pData;                    /* Input data matrix pointer B */
791         q15_t *pInA = pSrcA->pData;                    /* Input data matrix pointer A of Q15 type */
792         q15_t *pInB = pSrcB->pData;                    /* Input data matrix pointer B of Q15 type */
793         q15_t *pOut = pDst->pData;                     /* Output data matrix pointer */
794         q15_t *px;                                     /* Temporary output data matrix pointer */
795         uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
796         uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
797         uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A    */
798         uint32_t col, i = 0U, row = numRowsA, colCnt;  /* Loop counters */
799         arm_status status;                             /* Status of matrix multiplication */
800         (void)pState;
801 
802 #ifdef ARM_MATH_MATRIX_CHECK
803 
804   /* Check for matrix mismatch condition */
805   if ((pSrcA->numCols != pSrcB->numRows) ||
806       (pSrcA->numRows != pDst->numRows)  ||
807       (pSrcB->numCols != pDst->numCols)    )
808   {
809     /* Set status as ARM_MATH_SIZE_MISMATCH */
810     status = ARM_MATH_SIZE_MISMATCH;
811   }
812   else
813 
814 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
815 
816   {
817     /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
818     /* row loop */
819     do
820     {
821       /* Output pointer is set to starting address of the row being processed */
822       px = pOut + i;
823 
824       /* For every row wise process, column loop counter is to be initiated */
825       col = numColsB;
826 
827       /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
828       pIn2 = pSrcB->pData;
829 
830       /* column loop */
831       do
832       {
833         /* Set the variable sum, that acts as accumulator, to zero */
834         sum = 0;
835 
836         /* Initiate pointer pIn1 to point to starting address of pSrcA */
837         pIn1 = pInA;
838 
839         /* Matrix A columns number of MAC operations are to be performed */
840         colCnt = numColsA;
841 
842         /* matrix multiplication */
843         while (colCnt > 0U)
844         {
845           /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
846 
847           /* Perform multiply-accumulates */
848           sum += (q31_t) * pIn1++ * *pIn2;
849           pIn2 += numColsB;
850 
851           /* Decrement loop counter */
852           colCnt--;
853         }
854 
855         /* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
856 
857         /* Saturate and store result in destination buffer */
858         *px++ = (q15_t) __SSAT((sum >> 15), 16);
859 
860         /* Decrement column loop counter */
861         col--;
862 
863         /* Update pointer pIn2 to point to starting address of next column */
864         pIn2 = pInB + (numColsB - col);
865 
866       } while (col > 0U);
867 
868       /* Update pointer pSrcA to point to starting address of next row */
869       i = i + numColsB;
870       pInA = pInA + numColsA;
871 
872       /* Decrement row loop counter */
873       row--;
874 
875     } while (row > 0U);
876 
877 #endif /* #if defined (ARM_MATH_DSP) */
878 
879     /* Set status as ARM_MATH_SUCCESS */
880     status = ARM_MATH_SUCCESS;
881   }
882 
883   /* Return to application */
884   return (status);
885 }
886 #endif /* defined(ARM_MATH_MVEI) */
887 
888 /**
889   @} end of MatrixMult group
890  */
891