/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_mat_mult_q15.c
 * Description:  Q15 matrix multiplication
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/matrix_functions.h"

/**
  @ingroup groupMatrix
 */

/**
  @addtogroup MatrixMult
  @{
 */

/**
  @brief         Q15 matrix multiplication.
  @param[in]     pSrcA      points to the first input matrix structure
  @param[in]     pSrcB      points to the second input matrix structure
  @param[out]    pDst       points to output matrix structure
  @param[in]     pState     points to the array for storing intermediate results (Unused)
  @return        execution status
                   - \ref ARM_MATH_SUCCESS       : Operation successful
                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed

  @par           Scaling and Overflow Behavior
                 The function is implemented using an internal 64-bit accumulator. The inputs to the
                 multiplications are in 1.15 format and multiplications yield a 2.30 result.
                 The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
                 This approach provides 33 guard bits and there is no risk of overflow.
                 The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
                 and then saturated to 1.15 format.
  @par
                 Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
 */
60 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
61
62 #define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
63
64 #define MATRIX_DIM2 2
65 #define MATRIX_DIM3 3
66 #define MATRIX_DIM4 4
67
arm_mat_mult_q15_2x2_mve(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst)68 __STATIC_INLINE arm_status arm_mat_mult_q15_2x2_mve(
69 const arm_matrix_instance_q15 * pSrcA,
70 const arm_matrix_instance_q15 * pSrcB,
71 arm_matrix_instance_q15 * pDst)
72 {
73 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
74 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
75 q15_t *pOut = pDst->pData; /* output data matrix pointer */
76 uint16x8_t vecColBOffs;
77 q15_t *pInA0 = pInA;
78 q15_t *pInA1 = pInA0 + MATRIX_DIM2;
79 q63_t acc0, acc1;
80 q15x8_t vecB, vecA0, vecA1;
81 mve_pred16_t p0 = vctp16q(MATRIX_DIM2);
82
83 vecColBOffs = vidupq_u16((uint32_t)0, 2); /* MATRIX_DIM2 */
84
85 pInB = pSrcB->pData;
86
87 vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
88
89 vecA0 = vldrhq_s16(pInA0);
90 vecA1 = vldrhq_s16(pInA1);
91
92 acc0 = vmlaldavq(vecA0, vecB);
93 acc1 = vmlaldavq(vecA1, vecB);
94
95 acc0 = asrl(acc0, 15);
96 acc1 = asrl(acc1, 15);
97
98 pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
99 pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
100 pOut++;
101
102 /* move to next B column */
103 pInB = pInB + 1;
104
105 vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
106
107 acc0 = vmlaldavq(vecA0, vecB);
108 acc1 = vmlaldavq(vecA1, vecB);
109
110 acc0 = asrl(acc0, 15);
111 acc1 = asrl(acc1, 15);
112
113 pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
114 pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
115
116 /*
117 * Return to application
118 */
119 return (ARM_MATH_SUCCESS);
120 }
121
122
123
arm_mat_mult_q15_3x3_mve(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst)124 __STATIC_INLINE arm_status arm_mat_mult_q15_3x3_mve(
125 const arm_matrix_instance_q15 * pSrcA,
126 const arm_matrix_instance_q15 * pSrcB,
127 arm_matrix_instance_q15 * pDst)
128 {
129 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
130 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
131 q15_t *pOut = pDst->pData; /* output data matrix pointer */
132 uint16x8_t vecColBOffs;
133 q15_t *pInA0 = pInA;
134 q15_t *pInA1 = pInA0 + MATRIX_DIM3;
135 q15_t *pInA2 = pInA1 + MATRIX_DIM3;
136 q63_t acc0, acc1, acc2;
137 q15x8_t vecB, vecA0, vecA1, vecA2;
138 mve_pred16_t p0 = vctp16q(MATRIX_DIM3);
139
140 vecColBOffs = vidupq_u16((uint32_t)0, 1);
141 vecColBOffs = vecColBOffs * MATRIX_DIM3;
142
143 pInB = pSrcB->pData;
144
145 vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
146
147 vecA0 = vldrhq_s16(pInA0);
148 vecA1 = vldrhq_s16(pInA1);
149 vecA2 = vldrhq_s16(pInA2);
150
151 acc0 = vmlaldavq(vecA0, vecB);
152 acc1 = vmlaldavq(vecA1, vecB);
153 acc2 = vmlaldavq(vecA2, vecB);
154
155 acc0 = asrl(acc0, 15);
156 acc1 = asrl(acc1, 15);
157 acc2 = asrl(acc2, 15);
158
159 pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
160 pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
161 pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
162 pOut++;
163
164 /* move to next B column */
165 pInB = pInB + 1;
166
167 vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
168
169 acc0 = vmlaldavq(vecA0, vecB);
170 acc1 = vmlaldavq(vecA1, vecB);
171 acc2 = vmlaldavq(vecA2, vecB);
172
173 acc0 = asrl(acc0, 15);
174 acc1 = asrl(acc1, 15);
175 acc2 = asrl(acc2, 15);
176
177 pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
178 pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
179 pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
180 pOut++;
181
182 /* move to next B column */
183 pInB = pInB + 1;
184
185 vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
186
187 acc0 = vmlaldavq(vecA0, vecB);
188 acc1 = vmlaldavq(vecA1, vecB);
189 acc2 = vmlaldavq(vecA2, vecB);
190
191 acc0 = asrl(acc0, 15);
192 acc1 = asrl(acc1, 15);
193 acc2 = asrl(acc2, 15);
194
195 pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
196 pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
197 pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
198 /*
199 * Return to application
200 */
201 return (ARM_MATH_SUCCESS);
202 }
203
204
arm_mat_mult_q15_4x4_mve(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst)205 __STATIC_INLINE arm_status arm_mat_mult_q15_4x4_mve(
206 const arm_matrix_instance_q15 * pSrcA,
207 const arm_matrix_instance_q15 * pSrcB,
208 arm_matrix_instance_q15 * pDst)
209 {
210 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
211 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
212 q15_t *pOut = pDst->pData; /* output data matrix pointer */
213 uint16x8_t vecColBOffs;
214 q15_t *pInA0 = pInA;
215 q15_t *pInA1 = pInA0 + MATRIX_DIM4;
216 q15_t *pInA2 = pInA1 + MATRIX_DIM4;
217 q15_t *pInA3 = pInA2 + MATRIX_DIM4;
218 q63_t acc0, acc1, acc2, acc3;
219 q15x8_t vecB, vecA0, vecA1, vecA2, vecA3;
220 mve_pred16_t p0 = vctp16q(MATRIX_DIM4);
221
222 vecColBOffs = vidupq_u16((uint32_t)0, 4);
223
224 pInB = pSrcB->pData;
225
226 vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
227
228 vecA0 = vldrhq_s16(pInA0);
229 vecA1 = vldrhq_s16(pInA1);
230 vecA2 = vldrhq_s16(pInA2);
231 vecA3 = vldrhq_s16(pInA3);
232
233 acc0 = vmlaldavq(vecA0, vecB);
234 acc1 = vmlaldavq(vecA1, vecB);
235 acc2 = vmlaldavq(vecA2, vecB);
236 acc3 = vmlaldavq(vecA3, vecB);
237
238 acc0 = asrl(acc0, 15);
239 acc1 = asrl(acc1, 15);
240 acc2 = asrl(acc2, 15);
241 acc3 = asrl(acc3, 15);
242
243 pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
244 pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
245 pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
246 pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
247 pOut++;
248
249 /* move to next B column */
250 pInB = pInB + 1;
251
252 vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
253
254 acc0 = vmlaldavq(vecA0, vecB);
255 acc1 = vmlaldavq(vecA1, vecB);
256 acc2 = vmlaldavq(vecA2, vecB);
257 acc3 = vmlaldavq(vecA3, vecB);
258
259 acc0 = asrl(acc0, 15);
260 acc1 = asrl(acc1, 15);
261 acc2 = asrl(acc2, 15);
262 acc3 = asrl(acc3, 15);
263
264 pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
265 pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
266 pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
267 pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
268
269 pOut++;
270
271 /* move to next B column */
272 pInB = pInB + 1;
273
274 vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
275
276 acc0 = vmlaldavq(vecA0, vecB);
277 acc1 = vmlaldavq(vecA1, vecB);
278 acc2 = vmlaldavq(vecA2, vecB);
279 acc3 = vmlaldavq(vecA3, vecB);
280
281 acc0 = asrl(acc0, 15);
282 acc1 = asrl(acc1, 15);
283 acc2 = asrl(acc2, 15);
284 acc3 = asrl(acc3, 15);
285
286 pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
287 pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
288 pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
289 pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
290
291 pOut++;
292
293 /* move to next B column */
294 pInB = pInB + 1;
295
296 vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
297
298 acc0 = vmlaldavq(vecA0, vecB);
299 acc1 = vmlaldavq(vecA1, vecB);
300 acc2 = vmlaldavq(vecA2, vecB);
301 acc3 = vmlaldavq(vecA3, vecB);
302
303 acc0 = asrl(acc0, 15);
304 acc1 = asrl(acc1, 15);
305 acc2 = asrl(acc2, 15);
306 acc3 = asrl(acc3, 15);
307
308 pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
309 pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
310 pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
311 pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
312 /*
313 * Return to application
314 */
315 return (ARM_MATH_SUCCESS);
316 }
317
arm_mat_mult_q15(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst,q15_t * pState)318 arm_status arm_mat_mult_q15(
319 const arm_matrix_instance_q15 * pSrcA,
320 const arm_matrix_instance_q15 * pSrcB,
321 arm_matrix_instance_q15 * pDst,
322 q15_t * pState)
323 {
324 q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
325 q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
326 q15_t *pOut = pDst->pData; /* output data matrix pointer */
327 q15_t *px; /* Temporary output data matrix pointer */
328 uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
329 uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
330 uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
331 uint16_t col, i = 0U, row = numRowsA; /* loop counters */
332 uint16x8_t vecOffs, vecColBOffs;
333 uint32_t blkCnt,rowCnt; /* loop counters */
334 arm_status status; /* Status of matrix multiplication */
335 (void)pState;
336
337 #ifdef ARM_MATH_MATRIX_CHECK
338
339 /* Check for matrix mismatch condition */
340 if ((pSrcA->numCols != pSrcB->numRows) ||
341 (pSrcA->numRows != pDst->numRows) ||
342 (pSrcB->numCols != pDst->numCols) )
343 {
344 /* Set status as ARM_MATH_SIZE_MISMATCH */
345 status = ARM_MATH_SIZE_MISMATCH;
346 }
347 else
348 #endif
349 {
350 /* small squared matrix specialized routines */
351 if(numRowsA == numColsB && numColsB == numColsA) {
352
353 if (numRowsA == 1)
354 {
355 q63_t sum;
356 sum = pInA[0] * pInB[0];
357 pOut[0] = (q15_t) __SSAT((sum >> 15), 16);
358 return (ARM_MATH_SUCCESS);
359 }
360 else if(numRowsA == 2)
361 return arm_mat_mult_q15_2x2_mve(pSrcA, pSrcB, pDst);
362 else if(numRowsA == 3)
363 return arm_mat_mult_q15_3x3_mve(pSrcA, pSrcB, pDst);
364 else if (numRowsA == 4)
365 return arm_mat_mult_q15_4x4_mve(pSrcA, pSrcB, pDst);
366 }
367
368 vecColBOffs = vidupq_u16((uint32_t)0, 1);
369 vecColBOffs = vecColBOffs * (uint16_t) (numColsB);
370
371 /*
372 * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
373 */
374
375 /*
376 * row loop
377 */
378 rowCnt = row >> 2;
379 while (rowCnt > 0U)
380 {
381 /*
382 * Output pointer is set to starting address of the row being processed
383 */
384 px = pOut + i;
385 i = i + 4 * numColsB;
386 /*
387 * For every row wise process, the column loop counter is to be initiated
388 */
389 col = numColsB;
390 /*
391 * For every row wise process, the pInB pointer is set
392 * to the starting address of the pSrcB data
393 */
394 pInB = pSrcB->pData;
395 /*
396 * column loop
397 */
398 while (col > 0U)
399 {
400 /*
401 * generate 4 columns elements
402 */
403 /*
404 * Matrix A columns number of MAC operations are to be performed
405 */
406
407 q15_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
408 q15_t *pInA0 = pInA;
409 q15_t *pInA1 = pInA0 + numColsA;
410 q15_t *pInA2 = pInA1 + numColsA;
411 q15_t *pInA3 = pInA2 + numColsA;
412 q63_t acc0, acc1, acc2, acc3;
413
414 acc0 = 0LL;
415 acc1 = 0LL;
416 acc2 = 0LL;
417 acc3 = 0LL;
418
419 pSrcA0Vec = (q15_t const *) pInA0;
420 pSrcA1Vec = (q15_t const *) pInA1;
421 pSrcA2Vec = (q15_t const *) pInA2;
422 pSrcA3Vec = (q15_t const *) pInA3;
423
424 vecOffs = vecColBOffs;
425
426 blkCnt = (numColsA) >> 3;
427 while (blkCnt > 0U)
428 {
429 q15x8_t vecB, vecA;
430
431 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
432 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
433
434 vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 8;
435 acc0 = vmlaldavaq(acc0, vecA, vecB);
436 vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 8;
437 acc1 = vmlaldavaq(acc1, vecA, vecB);
438 vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 8;
439 acc2 = vmlaldavaq(acc2, vecA, vecB);
440 vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 8;
441 acc3 = vmlaldavaq(acc3, vecA, vecB);
442 blkCnt--;
443
444 }
445 /*
446 * tail
447 */
448 blkCnt = numColsA & 7;
449 if (blkCnt > 0U)
450 {
451 mve_pred16_t p0 = vctp16q(blkCnt);
452 q15x8_t vecB, vecA;
453
454 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
455 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
456
457 vecA = vld1q(pSrcA0Vec);
458 acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
459 vecA = vld1q(pSrcA1Vec);
460 acc1 = vmlaldavaq_p(acc1, vecA, vecB, p0);
461 vecA = vld1q(pSrcA2Vec);
462 acc2 = vmlaldavaq_p(acc2, vecA, vecB, p0);
463 vecA = vld1q(pSrcA3Vec);
464 acc3 = vmlaldavaq_p(acc3, vecA, vecB, p0);
465 }
466
467 px[0] = (q15_t)MVE_ASRL_SAT16(acc0, 15);
468 px[1 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc1, 15);
469 px[2 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc2, 15);
470 px[3 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc3, 15);
471 px++;
472 /*
473 * Decrement the column loop counter
474 */
475 col--;
476 /*
477 * Update the pointer pInB to point to the starting address of the next column
478 */
479 pInB = pSrcB->pData + (numColsB - col);
480 }
481
482 /*
483 * Update the pointer pInA to point to the starting address of the next row
484 */
485 pInA += (numColsA * 4);
486 /*
487 * Decrement the row loop counter
488 */
489 rowCnt --;
490
491 }
492
493 rowCnt = row & 3;
494 while (rowCnt > 0U)
495 {
496 /*
497 * Output pointer is set to starting address of the row being processed
498 */
499 px = pOut + i;
500 i = i + numColsB;
501 /*
502 * For every row wise process, the column loop counter is to be initiated
503 */
504 col = numColsB;
505 /*
506 * For every row wise process, the pInB pointer is set
507 * to the starting address of the pSrcB data
508 */
509 pInB = pSrcB->pData;
510 /*
511 * column loop
512 */
513 while (col > 0U)
514 {
515 /*
516 * generate 4 columns elements
517 */
518 /*
519 * Matrix A columns number of MAC operations are to be performed
520 */
521
522 q15_t const *pSrcA0Vec;
523 q15_t *pInA0 = pInA;
524 q63_t acc0;
525
526 acc0 = 0LL;
527
528 pSrcA0Vec = (q15_t const *) pInA0;
529
530 vecOffs = vecColBOffs;
531
532 blkCnt = (numColsA) >> 3;
533 while (blkCnt > 0U)
534 {
535 q15x8_t vecB, vecA;
536
537 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
538 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
539
540 vecA = vld1q(pSrcA0Vec);
541 pSrcA0Vec += 8;
542 acc0 = vmlaldavaq(acc0, vecA, vecB);
543
544 blkCnt--;
545
546 }
547 /*
548 * tail
549 */
550 blkCnt = numColsA & 7;
551 if (blkCnt > 0U)
552 {
553 mve_pred16_t p0 = vctp16q(blkCnt);
554 q15x8_t vecB, vecA;
555
556 vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
557 vecOffs = vecOffs + (uint16_t) (numColsB * 8);
558
559 vecA = vld1q(pSrcA0Vec);
560 acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
561
562 }
563
564 px[0] = (q15_t)MVE_ASRL_SAT16(acc0, 15);
565
566 px++;
567 /*
568 * Decrement the column loop counter
569 */
570 col--;
571 /*
572 * Update the pointer pInB to point to the starting address of the next column
573 */
574 pInB = pSrcB->pData + (numColsB - col);
575 }
576
577 /*
578 * Update the pointer pInA to point to the starting address of the next row
579 */
580 pInA += (numColsA );
581 rowCnt--;
582 }
583 /* Set status as ARM_MATH_SUCCESS */
584 status = ARM_MATH_SUCCESS;
585 }
586
587 /* Return to application */
588 return (status);
589
590 }
591 #else
arm_mat_mult_q15(const arm_matrix_instance_q15 * pSrcA,const arm_matrix_instance_q15 * pSrcB,arm_matrix_instance_q15 * pDst,q15_t * pState)592 arm_status arm_mat_mult_q15(
593 const arm_matrix_instance_q15 * pSrcA,
594 const arm_matrix_instance_q15 * pSrcB,
595 arm_matrix_instance_q15 * pDst,
596 q15_t * pState)
597 {
598 q63_t sum; /* Accumulator */
599
600 #if defined (ARM_MATH_DSP) /* != CM0 */
601
602 q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
603 q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
604 q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
605 q15_t *px; /* Temporary output data matrix pointer */
606 uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
607 uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
608 uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
609 uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix B */
610 uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
611 arm_status status; /* Status of matrix multiplication */
612
613 q31_t in; /* Temporary variable to hold the input value */
614 q31_t inA1, inB1, inA2, inB2;
615
616 #ifdef ARM_MATH_MATRIX_CHECK
617
618 /* Check for matrix mismatch condition */
619 if ((pSrcA->numCols != pSrcB->numRows) ||
620 (pSrcA->numRows != pDst->numRows) ||
621 (pSrcB->numCols != pDst->numCols) )
622 {
623 /* Set status as ARM_MATH_SIZE_MISMATCH */
624 status = ARM_MATH_SIZE_MISMATCH;
625 }
626 else
627
628 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
629
630 {
631 /* Matrix transpose */
632 do
633 {
634 /* The pointer px is set to starting address of column being processed */
635 px = pSrcBT + i;
636
637 /* Apply loop unrolling and exchange columns with row elements */
638 col = numColsB >> 2U;
639
640 /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
641 ** a second loop below computes the remaining 1 to 3 samples. */
642 while (col > 0U)
643 {
644 /* Read two elements from row */
645 in = read_q15x2_ia ((q15_t **) &pInB);
646
647 /* Unpack and store one element in destination */
648 #ifndef ARM_MATH_BIG_ENDIAN
649 *px = (q15_t) in;
650 #else
651 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
652 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
653
654 /* Update pointer px to point to next row of transposed matrix */
655 px += numRowsB;
656
657 /* Unpack and store second element in destination */
658 #ifndef ARM_MATH_BIG_ENDIAN
659 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
660 #else
661 *px = (q15_t) in;
662 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
663
664 /* Update pointer px to point to next row of transposed matrix */
665 px += numRowsB;
666
667 /* Read two elements from row */
668 in = read_q15x2_ia ((q15_t **) &pInB);
669
670 /* Unpack and store one element in destination */
671 #ifndef ARM_MATH_BIG_ENDIAN
672 *px = (q15_t) in;
673 #else
674 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
675 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
676 px += numRowsB;
677
678 #ifndef ARM_MATH_BIG_ENDIAN
679 *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
680 #else
681 *px = (q15_t) in;
682 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
683 px += numRowsB;
684
685 /* Decrement column loop counter */
686 col--;
687 }
688
689 /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
690 ** No loop unrolling is used. */
691 col = numColsB % 0x4U;
692
693 while (col > 0U)
694 {
695 /* Read and store input element in destination */
696 *px = *pInB++;
697
698 /* Update pointer px to point to next row of transposed matrix */
699 px += numRowsB;
700
701 /* Decrement column loop counter */
702 col--;
703 }
704
705 i++;
706
707 /* Decrement row loop counter */
708 row--;
709
710 } while (row > 0U);
711
712 /* Reset variables for usage in following multiplication process */
713 row = numRowsA;
714 i = 0U;
715 px = pDst->pData;
716
717 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
718 /* row loop */
719 do
720 {
721 /* For every row wise process, column loop counter is to be initiated */
722 col = numColsB;
723
724 /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
725 pInB = pSrcBT;
726
727 /* column loop */
728 do
729 {
730 /* Set variable sum, that acts as accumulator, to zero */
731 sum = 0;
732
733 /* Initiate pointer pInA to point to starting address of column being processed */
734 pInA = pSrcA->pData + i;
735
736 /* Apply loop unrolling and compute 2 MACs simultaneously. */
737 colCnt = numColsA >> 2U;
738
739 /* matrix multiplication */
740 while (colCnt > 0U)
741 {
742 /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
743
744 /* read real and imag values from pSrcA and pSrcB buffer */
745 inA1 = read_q15x2_ia ((q15_t **) &pInA);
746 inB1 = read_q15x2_ia ((q15_t **) &pInB);
747
748 inA2 = read_q15x2_ia ((q15_t **) &pInA);
749 inB2 = read_q15x2_ia ((q15_t **) &pInB);
750
751 /* Multiply and Accumulates */
752 sum = __SMLALD(inA1, inB1, sum);
753 sum = __SMLALD(inA2, inB2, sum);
754
755 /* Decrement loop counter */
756 colCnt--;
757 }
758
759 /* process remaining column samples */
760 colCnt = numColsA % 0x4U;
761
762 while (colCnt > 0U)
763 {
764 /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
765 sum += *pInA++ * *pInB++;
766
767 /* Decrement loop counter */
768 colCnt--;
769 }
770
771 /* Saturate and store result in destination buffer */
772 *px = (q15_t) (__SSAT((sum >> 15), 16));
773 px++;
774
775 /* Decrement column loop counter */
776 col--;
777
778 } while (col > 0U);
779
780 i = i + numColsA;
781
782 /* Decrement row loop counter */
783 row--;
784
785 } while (row > 0U);
786
787 #else /* #if defined (ARM_MATH_DSP) */
788
789 q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
790 q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
791 q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
792 q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
793 q15_t *pOut = pDst->pData; /* Output data matrix pointer */
794 q15_t *px; /* Temporary output data matrix pointer */
795 uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
796 uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
797 uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
798 uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
799 arm_status status; /* Status of matrix multiplication */
800 (void)pState;
801
802 #ifdef ARM_MATH_MATRIX_CHECK
803
804 /* Check for matrix mismatch condition */
805 if ((pSrcA->numCols != pSrcB->numRows) ||
806 (pSrcA->numRows != pDst->numRows) ||
807 (pSrcB->numCols != pDst->numCols) )
808 {
809 /* Set status as ARM_MATH_SIZE_MISMATCH */
810 status = ARM_MATH_SIZE_MISMATCH;
811 }
812 else
813
814 #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
815
816 {
817 /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
818 /* row loop */
819 do
820 {
821 /* Output pointer is set to starting address of the row being processed */
822 px = pOut + i;
823
824 /* For every row wise process, column loop counter is to be initiated */
825 col = numColsB;
826
827 /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
828 pIn2 = pSrcB->pData;
829
830 /* column loop */
831 do
832 {
833 /* Set the variable sum, that acts as accumulator, to zero */
834 sum = 0;
835
836 /* Initiate pointer pIn1 to point to starting address of pSrcA */
837 pIn1 = pInA;
838
839 /* Matrix A columns number of MAC operations are to be performed */
840 colCnt = numColsA;
841
842 /* matrix multiplication */
843 while (colCnt > 0U)
844 {
845 /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
846
847 /* Perform multiply-accumulates */
848 sum += (q31_t) * pIn1++ * *pIn2;
849 pIn2 += numColsB;
850
851 /* Decrement loop counter */
852 colCnt--;
853 }
854
855 /* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
856
857 /* Saturate and store result in destination buffer */
858 *px++ = (q15_t) __SSAT((sum >> 15), 16);
859
860 /* Decrement column loop counter */
861 col--;
862
863 /* Update pointer pIn2 to point to starting address of next column */
864 pIn2 = pInB + (numColsB - col);
865
866 } while (col > 0U);
867
868 /* Update pointer pSrcA to point to starting address of next row */
869 i = i + numColsB;
870 pInA = pInA + numColsA;
871
872 /* Decrement row loop counter */
873 row--;
874
875 } while (row > 0U);
876
877 #endif /* #if defined (ARM_MATH_DSP) */
878
879 /* Set status as ARM_MATH_SUCCESS */
880 status = ARM_MATH_SUCCESS;
881 }
882
883 /* Return to application */
884 return (status);
885 }
886 #endif /* defined(ARM_MATH_MVEI) */

/**
  @} end of MatrixMult group
 */