1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_fast_q31.c
4 * Description: Fast Q31 Convolution
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q31 sequences (fast version).
42 @param[in] pSrcA points to the first input sequence.
43 @param[in] srcALen length of the first input sequence.
44 @param[in] pSrcB points to the second input sequence.
45 @param[in] srcBLen length of the second input sequence.
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47 @return none
48
49 @par Scaling and Overflow Behavior
50 This function is optimized for speed at the expense of fixed-point precision and overflow protection.
51 The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
52 These intermediate results are accumulated in a 32-bit register in 2.30 format.
53 Finally, the accumulator is saturated and converted to a 1.31 result.
54 @par
55 The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
56 In order to avoid overflows completely the input signals must be scaled down.
57 Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
58 as maximum of min(srcALen, srcBLen) number of additions are carried internally.
59 @remark
60 Refer to \ref arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
61 */
62
arm_conv_fast_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)63 void arm_conv_fast_q31(
64 const q31_t * pSrcA,
65 uint32_t srcALen,
66 const q31_t * pSrcB,
67 uint32_t srcBLen,
68 q31_t * pDst)
69 {
70 const q31_t *pIn1; /* InputA pointer */
71 const q31_t *pIn2; /* InputB pointer */
72 q31_t *pOut = pDst; /* Output pointer */
73 const q31_t *px; /* Intermediate inputA pointer */
74 const q31_t *py; /* Intermediate inputB pointer */
75 const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
76 q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
77 q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
78 uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
79 uint32_t j, k, count, blkCnt; /* Loop counters */
80
81 /* The algorithm implementation is based on the lengths of the inputs. */
82 /* srcB is always made to slide across srcA. */
83 /* So srcBLen is always considered as shorter or equal to srcALen */
84 if (srcALen >= srcBLen)
85 {
86 /* Initialization of inputA pointer */
87 pIn1 = pSrcA;
88
89 /* Initialization of inputB pointer */
90 pIn2 = pSrcB;
91 }
92 else
93 {
94 /* Initialization of inputA pointer */
95 pIn1 = pSrcB;
96
97 /* Initialization of inputB pointer */
98 pIn2 = pSrcA;
99
100 /* srcBLen is always considered as shorter or equal to srcALen */
101 j = srcBLen;
102 srcBLen = srcALen;
103 srcALen = j;
104 }
105
106 /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
107 /* The function is internally
108 * divided into three stages according to the number of multiplications that has to be
109 * taken place between inputA samples and inputB samples. In the first stage of the
110 * algorithm, the multiplications increase by one for every iteration.
111 * In the second stage of the algorithm, srcBLen number of multiplications are done.
112 * In the third stage of the algorithm, the multiplications decrease by one
113 * for every iteration. */
114
115 /* The algorithm is implemented in three stages.
116 The loop counters of each stage is initiated here. */
117 blockSize1 = srcBLen - 1U;
118 blockSize2 = srcALen - (srcBLen - 1U);
119 blockSize3 = blockSize1;
120
121 /* --------------------------
122 * Initializations of stage1
123 * -------------------------*/
124
125 /* sum = x[0] * y[0]
126 * sum = x[0] * y[1] + x[1] * y[0]
127 * ....
128 * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
129 */
130
131 /* In this stage the MAC operations are increased by 1 for every iteration.
132 The count variable holds the number of MAC operations performed */
133 count = 1U;
134
135 /* Working pointer of inputA */
136 px = pIn1;
137
138 /* Working pointer of inputB */
139 py = pIn2;
140
141
142 /* ------------------------
143 * Stage1 process
144 * ----------------------*/
145
146 /* The first stage starts here */
147 while (blockSize1 > 0U)
148 {
149 /* Accumulator is made zero for every iteration */
150 sum = 0;
151
152 /* Apply loop unrolling and compute 4 MACs simultaneously. */
153 k = count >> 2U;
154
155 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
156 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
157 while (k > 0U)
158 {
159 /* x[0] * y[srcBLen - 1] */
160 sum = (q31_t) ((((q63_t) sum << 32) +
161 ((q63_t) *px++ * (*py--))) >> 32);
162
163 /* x[1] * y[srcBLen - 2] */
164 sum = (q31_t) ((((q63_t) sum << 32) +
165 ((q63_t) *px++ * (*py--))) >> 32);
166
167 /* x[2] * y[srcBLen - 3] */
168 sum = (q31_t) ((((q63_t) sum << 32) +
169 ((q63_t) *px++ * (*py--))) >> 32);
170
171 /* x[3] * y[srcBLen - 4] */
172 sum = (q31_t) ((((q63_t) sum << 32) +
173 ((q63_t) *px++ * (*py--))) >> 32);
174
175 /* Decrement loop counter */
176 k--;
177 }
178
179 /* If the count is not a multiple of 4, compute any remaining MACs here.
180 ** No loop unrolling is used. */
181 k = count % 0x4U;
182
183 while (k > 0U)
184 {
185 /* Perform the multiply-accumulate */
186 sum = (q31_t) ((((q63_t) sum << 32) +
187 ((q63_t) *px++ * (*py--))) >> 32);
188
189 /* Decrement loop counter */
190 k--;
191 }
192
193 /* Store the result in the accumulator in the destination buffer. */
194 *pOut++ = sum << 1;
195
196 /* Update the inputA and inputB pointers for next MAC calculation */
197 py = pIn2 + count;
198 px = pIn1;
199
200 /* Increment MAC count */
201 count++;
202
203 /* Decrement loop counter */
204 blockSize1--;
205 }
206
207 /* --------------------------
208 * Initializations of stage2
209 * ------------------------*/
210
211 /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
212 * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
213 * ....
214 * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
215 */
216
217 /* Working pointer of inputA */
218 px = pIn1;
219
220 /* Working pointer of inputB */
221 pSrc2 = pIn2 + (srcBLen - 1U);
222 py = pSrc2;
223
224 /* count is index by which the pointer pIn1 to be incremented */
225 count = 0U;
226
227 /* -------------------
228 * Stage2 process
229 * ------------------*/
230
231 /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
232 * So, to loop unroll over blockSize2,
233 * srcBLen should be greater than or equal to 4 */
234 if (srcBLen >= 4U)
235 {
236 /* Loop unroll over blockSize2, by 4 */
237 blkCnt = blockSize2 >> 2U;
238
239 while (blkCnt > 0U)
240 {
241 /* Set all accumulators to zero */
242 acc0 = 0;
243 acc1 = 0;
244 acc2 = 0;
245 acc3 = 0;
246
247 /* read x[0], x[1], x[2] samples */
248 x0 = *px++;
249 x1 = *px++;
250 x2 = *px++;
251
252 /* Apply loop unrolling and compute 4 MACs simultaneously. */
253 k = srcBLen >> 2U;
254
255 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
256 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
257 do
258 {
259 /* Read y[srcBLen - 1] sample */
260 c0 = *py--;
261 /* Read x[3] sample */
262 x3 = *px++;
263
264 /* Perform the multiply-accumulate */
265 /* acc0 += x[0] * y[srcBLen - 1] */
266 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
267 /* acc1 += x[1] * y[srcBLen - 1] */
268 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
269 /* acc2 += x[2] * y[srcBLen - 1] */
270 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
271 /* acc3 += x[3] * y[srcBLen - 1] */
272 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
273
274
275 /* Read y[srcBLen - 2] sample */
276 c0 = *py--;
277 /* Read x[4] sample */
278 x0 = *px++;
279
280 /* Perform the multiply-accumulate */
281 /* acc0 += x[1] * y[srcBLen - 2] */
282 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
283 /* acc1 += x[2] * y[srcBLen - 2] */
284 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
285 /* acc2 += x[3] * y[srcBLen - 2] */
286 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
287 /* acc3 += x[4] * y[srcBLen - 2] */
288 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
289
290
291 /* Read y[srcBLen - 3] sample */
292 c0 = *py--;
293 /* Read x[5] sample */
294 x1 = *px++;
295
296 /* Perform the multiply-accumulates */
297 /* acc0 += x[2] * y[srcBLen - 3] */
298 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
299 /* acc1 += x[3] * y[srcBLen - 3] */
300 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
301 /* acc2 += x[4] * y[srcBLen - 3] */
302 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
303 /* acc3 += x[5] * y[srcBLen - 3] */
304 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
305
306
307 /* Read y[srcBLen - 4] sample */
308 c0 = *py--;
309 /* Read x[6] sample */
310 x2 = *px++;
311
312 /* Perform the multiply-accumulates */
313 /* acc0 += x[3] * y[srcBLen - 4] */
314 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
315 /* acc1 += x[4] * y[srcBLen - 4] */
316 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
317 /* acc2 += x[5] * y[srcBLen - 4] */
318 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
319 /* acc3 += x[6] * y[srcBLen - 4] */
320 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
321
322
323 } while (--k);
324
325 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
326 ** No loop unrolling is used. */
327 k = srcBLen % 0x4U;
328
329 while (k > 0U)
330 {
331 /* Read y[srcBLen - 5] sample */
332 c0 = *py--;
333 /* Read x[7] sample */
334 x3 = *px++;
335
336 /* Perform the multiply-accumulates */
337 /* acc0 += x[4] * y[srcBLen - 5] */
338 acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
339 /* acc1 += x[5] * y[srcBLen - 5] */
340 acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
341 /* acc2 += x[6] * y[srcBLen - 5] */
342 acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
343 /* acc3 += x[7] * y[srcBLen - 5] */
344 acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
345
346 /* Reuse the present samples for the next MAC */
347 x0 = x1;
348 x1 = x2;
349 x2 = x3;
350
351 /* Decrement loop counter */
352 k--;
353 }
354
355 /* Store the result in the accumulator in the destination buffer. */
356 *pOut++ = (q31_t) (acc0 << 1);
357 *pOut++ = (q31_t) (acc1 << 1);
358 *pOut++ = (q31_t) (acc2 << 1);
359 *pOut++ = (q31_t) (acc3 << 1);
360
361 /* Increment the pointer pIn1 index, count by 4 */
362 count += 4U;
363
364 /* Update the inputA and inputB pointers for next MAC calculation */
365 px = pIn1 + count;
366 py = pSrc2;
367
368 /* Decrement loop counter */
369 blkCnt--;
370 }
371
372 /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
373 ** No loop unrolling is used. */
374 blkCnt = blockSize2 % 0x4U;
375
376 while (blkCnt > 0U)
377 {
378 /* Accumulator is made zero for every iteration */
379 sum = 0;
380
381 /* Apply loop unrolling and compute 4 MACs simultaneously. */
382 k = srcBLen >> 2U;
383
384 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
385 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
386 while (k > 0U)
387 {
388 /* Perform the multiply-accumulates */
389 sum = (q31_t) ((((q63_t) sum << 32) +
390 ((q63_t) *px++ * (*py--))) >> 32);
391 sum = (q31_t) ((((q63_t) sum << 32) +
392 ((q63_t) *px++ * (*py--))) >> 32);
393 sum = (q31_t) ((((q63_t) sum << 32) +
394 ((q63_t) *px++ * (*py--))) >> 32);
395 sum = (q31_t) ((((q63_t) sum << 32) +
396 ((q63_t) *px++ * (*py--))) >> 32);
397
398 /* Decrement loop counter */
399 k--;
400 }
401
402 /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
403 ** No loop unrolling is used. */
404 k = srcBLen % 0x4U;
405
406 while (k > 0U)
407 {
408 /* Perform the multiply-accumulate */
409 sum = (q31_t) ((((q63_t) sum << 32) +
410 ((q63_t) *px++ * (*py--))) >> 32);
411
412 /* Decrement loop counter */
413 k--;
414 }
415
416 /* Store the result in the accumulator in the destination buffer. */
417 *pOut++ = sum << 1;
418
419 /* Increment MAC count */
420 count++;
421
422 /* Update the inputA and inputB pointers for next MAC calculation */
423 px = pIn1 + count;
424 py = pSrc2;
425
426 /* Decrement loop counter */
427 blkCnt--;
428 }
429 }
430 else
431 {
432 /* If the srcBLen is not a multiple of 4,
433 * the blockSize2 loop cannot be unrolled by 4 */
434 blkCnt = blockSize2;
435
436 while (blkCnt > 0U)
437 {
438 /* Accumulator is made zero for every iteration */
439 sum = 0;
440
441 /* srcBLen number of MACS should be performed */
442 k = srcBLen;
443
444 while (k > 0U)
445 {
446 /* Perform the multiply-accumulate */
447 sum = (q31_t) ((((q63_t) sum << 32) +
448 ((q63_t) *px++ * (*py--))) >> 32);
449
450 /* Decrement loop counter */
451 k--;
452 }
453
454 /* Store the result in the accumulator in the destination buffer. */
455 *pOut++ = sum << 1;
456
457 /* Increment MAC count */
458 count++;
459
460 /* Update the inputA and inputB pointers for next MAC calculation */
461 px = pIn1 + count;
462 py = pSrc2;
463
464 /* Decrement loop counter */
465 blkCnt--;
466 }
467 }
468
469
470 /* --------------------------
471 * Initializations of stage3
472 * -------------------------*/
473
474 /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
475 * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
476 * ....
477 * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
478 * sum += x[srcALen-1] * y[srcBLen-1]
479 */
480
481 /* In this stage the MAC operations are decreased by 1 for every iteration.
482 The blockSize3 variable holds the number of MAC operations performed */
483
484 /* Working pointer of inputA */
485 pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
486 px = pSrc1;
487
488 /* Working pointer of inputB */
489 pSrc2 = pIn2 + (srcBLen - 1U);
490 py = pSrc2;
491
492 /* -------------------
493 * Stage3 process
494 * ------------------*/
495
496 while (blockSize3 > 0U)
497 {
498 /* Accumulator is made zero for every iteration */
499 sum = 0;
500
501 /* Apply loop unrolling and compute 4 MACs simultaneously. */
502 k = blockSize3 >> 2U;
503
504 /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
505 ** a second loop below computes MACs for the remaining 1 to 3 samples. */
506 while (k > 0U)
507 {
508 /* Perform the multiply-accumulate */
509 /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
510 sum = (q31_t) ((((q63_t) sum << 32) +
511 ((q63_t) *px++ * (*py--))) >> 32);
512
513 /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
514 sum = (q31_t) ((((q63_t) sum << 32) +
515 ((q63_t) *px++ * (*py--))) >> 32);
516
517 /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
518 sum = (q31_t) ((((q63_t) sum << 32) +
519 ((q63_t) *px++ * (*py--))) >> 32);
520
521 /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
522 sum = (q31_t) ((((q63_t) sum << 32) +
523 ((q63_t) *px++ * (*py--))) >> 32);
524
525 /* Decrement loop counter */
526 k--;
527 }
528
529 /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
530 ** No loop unrolling is used. */
531 k = blockSize3 % 0x4U;
532
533 while (k > 0U)
534 {
535 /* Perform the multiply-accumulate */
536 sum = (q31_t) ((((q63_t) sum << 32) +
537 ((q63_t) *px++ * (*py--))) >> 32);
538
539 /* Decrement loop counter */
540 k--;
541 }
542
543 /* Store the result in the accumulator in the destination buffer. */
544 *pOut++ = sum << 1;
545
546 /* Update the inputA and inputB pointers for next MAC calculation */
547 px = ++pSrc1;
548 py = pSrc2;
549
550 /* Decrement loop counter */
551 blockSize3--;
552 }
553
554 }
555
556 /**
557 @} end of Conv group
558 */
559