• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_fast_q31.c
4  * Description:  Fast Q31 Convolution
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q31 sequences (fast version).
42   @param[in]     pSrcA      points to the first input sequence.
43   @param[in]     srcALen    length of the first input sequence.
44   @param[in]     pSrcB      points to the second input sequence.
45   @param[in]     srcBLen    length of the second input sequence.
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
47   @return        none
48 
49   @par           Scaling and Overflow Behavior
50                    This function is optimized for speed at the expense of fixed-point precision and overflow protection.
51                    The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
52                    These intermediate results are accumulated in a 32-bit register in 2.30 format.
53                    Finally, the accumulator is saturated and converted to a 1.31 result.
54   @par
55                    The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
56                    In order to avoid overflows completely the input signals must be scaled down.
57                    Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
58                    as maximum of min(srcALen, srcBLen) number of additions are carried internally.
59   @remark
60                    Refer to \ref arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
61  */
62 
arm_conv_fast_q31(const q31_t * pSrcA,uint32_t srcALen,const q31_t * pSrcB,uint32_t srcBLen,q31_t * pDst)63 void arm_conv_fast_q31(
64   const q31_t * pSrcA,
65         uint32_t srcALen,
66   const q31_t * pSrcB,
67         uint32_t srcBLen,
68         q31_t * pDst)
69 {
70   const q31_t *pIn1;                                   /* InputA pointer */
71   const q31_t *pIn2;                                   /* InputB pointer */
72         q31_t *pOut = pDst;                            /* Output pointer */
73   const q31_t *px;                                     /* Intermediate inputA pointer */
74   const q31_t *py;                                     /* Intermediate inputB pointer */
75   const q31_t *pSrc1, *pSrc2;                          /* Intermediate pointers */
76         q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
77         q31_t x0, x1, x2, x3, c0;                      /* Temporary variables to hold state and coefficient values */
78         uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
79         uint32_t j, k, count, blkCnt;                  /* Loop counters */
80 
81   /* The algorithm implementation is based on the lengths of the inputs. */
82   /* srcB is always made to slide across srcA. */
83   /* So srcBLen is always considered as shorter or equal to srcALen */
84   if (srcALen >= srcBLen)
85   {
86     /* Initialization of inputA pointer */
87     pIn1 = pSrcA;
88 
89     /* Initialization of inputB pointer */
90     pIn2 = pSrcB;
91   }
92   else
93   {
94     /* Initialization of inputA pointer */
95     pIn1 = pSrcB;
96 
97     /* Initialization of inputB pointer */
98     pIn2 = pSrcA;
99 
100     /* srcBLen is always considered as shorter or equal to srcALen */
101     j = srcBLen;
102     srcBLen = srcALen;
103     srcALen = j;
104   }
105 
106   /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
107   /* The function is internally
108    * divided into three stages according to the number of multiplications that has to be
109    * taken place between inputA samples and inputB samples. In the first stage of the
110    * algorithm, the multiplications increase by one for every iteration.
111    * In the second stage of the algorithm, srcBLen number of multiplications are done.
112    * In the third stage of the algorithm, the multiplications decrease by one
113    * for every iteration. */
114 
115   /* The algorithm is implemented in three stages.
116      The loop counters of each stage is initiated here. */
117   blockSize1 = srcBLen - 1U;
118   blockSize2 = srcALen - (srcBLen - 1U);
119   blockSize3 = blockSize1;
120 
121   /* --------------------------
122    * Initializations of stage1
123    * -------------------------*/
124 
125   /* sum = x[0] * y[0]
126    * sum = x[0] * y[1] + x[1] * y[0]
127    * ....
128    * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
129    */
130 
131   /* In this stage the MAC operations are increased by 1 for every iteration.
132      The count variable holds the number of MAC operations performed */
133   count = 1U;
134 
135   /* Working pointer of inputA */
136   px = pIn1;
137 
138   /* Working pointer of inputB */
139   py = pIn2;
140 
141 
142   /* ------------------------
143    * Stage1 process
144    * ----------------------*/
145 
146   /* The first stage starts here */
147   while (blockSize1 > 0U)
148   {
149     /* Accumulator is made zero for every iteration */
150     sum = 0;
151 
152     /* Apply loop unrolling and compute 4 MACs simultaneously. */
153     k = count >> 2U;
154 
155     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
156      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
157     while (k > 0U)
158     {
159       /* x[0] * y[srcBLen - 1] */
160       sum = (q31_t) ((((q63_t) sum << 32) +
161                       ((q63_t) *px++ * (*py--))) >> 32);
162 
163       /* x[1] * y[srcBLen - 2] */
164       sum = (q31_t) ((((q63_t) sum << 32) +
165                       ((q63_t) *px++ * (*py--))) >> 32);
166 
167       /* x[2] * y[srcBLen - 3] */
168       sum = (q31_t) ((((q63_t) sum << 32) +
169                       ((q63_t) *px++ * (*py--))) >> 32);
170 
171       /* x[3] * y[srcBLen - 4] */
172       sum = (q31_t) ((((q63_t) sum << 32) +
173                       ((q63_t) *px++ * (*py--))) >> 32);
174 
175       /* Decrement loop counter */
176       k--;
177     }
178 
179     /* If the count is not a multiple of 4, compute any remaining MACs here.
180      ** No loop unrolling is used. */
181     k = count % 0x4U;
182 
183     while (k > 0U)
184     {
185       /* Perform the multiply-accumulate */
186       sum = (q31_t) ((((q63_t) sum << 32) +
187                       ((q63_t) *px++ * (*py--))) >> 32);
188 
189       /* Decrement loop counter */
190       k--;
191     }
192 
193     /* Store the result in the accumulator in the destination buffer. */
194     *pOut++ = sum << 1;
195 
196     /* Update the inputA and inputB pointers for next MAC calculation */
197     py = pIn2 + count;
198     px = pIn1;
199 
200     /* Increment MAC count */
201     count++;
202 
203     /* Decrement loop counter */
204     blockSize1--;
205   }
206 
207   /* --------------------------
208    * Initializations of stage2
209    * ------------------------*/
210 
211   /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
212    * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
213    * ....
214    * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
215    */
216 
217   /* Working pointer of inputA */
218   px = pIn1;
219 
220   /* Working pointer of inputB */
221   pSrc2 = pIn2 + (srcBLen - 1U);
222   py = pSrc2;
223 
224   /* count is index by which the pointer pIn1 to be incremented */
225   count = 0U;
226 
227   /* -------------------
228    * Stage2 process
229    * ------------------*/
230 
231   /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
232    * So, to loop unroll over blockSize2,
233    * srcBLen should be greater than or equal to 4 */
234   if (srcBLen >= 4U)
235   {
236     /* Loop unroll over blockSize2, by 4 */
237     blkCnt = blockSize2 >> 2U;
238 
239     while (blkCnt > 0U)
240     {
241       /* Set all accumulators to zero */
242       acc0 = 0;
243       acc1 = 0;
244       acc2 = 0;
245       acc3 = 0;
246 
247       /* read x[0], x[1], x[2] samples */
248       x0 = *px++;
249       x1 = *px++;
250       x2 = *px++;
251 
252       /* Apply loop unrolling and compute 4 MACs simultaneously. */
253       k = srcBLen >> 2U;
254 
255       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
256        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
257       do
258       {
259         /* Read y[srcBLen - 1] sample */
260         c0 = *py--;
261         /* Read x[3] sample */
262         x3 = *px++;
263 
264         /* Perform the multiply-accumulate */
265         /* acc0 +=  x[0] * y[srcBLen - 1] */
266         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
267         /* acc1 +=  x[1] * y[srcBLen - 1] */
268         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
269         /* acc2 +=  x[2] * y[srcBLen - 1] */
270         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
271         /* acc3 +=  x[3] * y[srcBLen - 1] */
272         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
273 
274 
275         /* Read y[srcBLen - 2] sample */
276         c0 = *py--;
277         /* Read x[4] sample */
278         x0 = *px++;
279 
280         /* Perform the multiply-accumulate */
281         /* acc0 +=  x[1] * y[srcBLen - 2] */
282         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
283         /* acc1 +=  x[2] * y[srcBLen - 2] */
284         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
285         /* acc2 +=  x[3] * y[srcBLen - 2] */
286         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
287         /* acc3 +=  x[4] * y[srcBLen - 2] */
288         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
289 
290 
291         /* Read y[srcBLen - 3] sample */
292         c0 = *py--;
293         /* Read x[5] sample */
294         x1 = *px++;
295 
296         /* Perform the multiply-accumulates */
297         /* acc0 +=  x[2] * y[srcBLen - 3] */
298         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
299         /* acc1 +=  x[3] * y[srcBLen - 3] */
300         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
301         /* acc2 +=  x[4] * y[srcBLen - 3] */
302         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
303         /* acc3 +=  x[5] * y[srcBLen - 3] */
304         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
305 
306 
307         /* Read y[srcBLen - 4] sample */
308         c0 = *py--;
309         /* Read x[6] sample */
310         x2 = *px++;
311 
312         /* Perform the multiply-accumulates */
313         /* acc0 +=  x[3] * y[srcBLen - 4] */
314         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
315         /* acc1 +=  x[4] * y[srcBLen - 4] */
316         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
317         /* acc2 +=  x[5] * y[srcBLen - 4] */
318         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
319         /* acc3 +=  x[6] * y[srcBLen - 4] */
320         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
321 
322 
323       } while (--k);
324 
325       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
326        ** No loop unrolling is used. */
327       k = srcBLen % 0x4U;
328 
329       while (k > 0U)
330       {
331         /* Read y[srcBLen - 5] sample */
332         c0 = *py--;
333         /* Read x[7] sample */
334         x3 = *px++;
335 
336         /* Perform the multiply-accumulates */
337         /* acc0 +=  x[4] * y[srcBLen - 5] */
338         acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
339         /* acc1 +=  x[5] * y[srcBLen - 5] */
340         acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
341         /* acc2 +=  x[6] * y[srcBLen - 5] */
342         acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
343         /* acc3 +=  x[7] * y[srcBLen - 5] */
344         acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
345 
346         /* Reuse the present samples for the next MAC */
347         x0 = x1;
348         x1 = x2;
349         x2 = x3;
350 
351         /* Decrement loop counter */
352         k--;
353       }
354 
355       /* Store the result in the accumulator in the destination buffer. */
356       *pOut++ = (q31_t) (acc0 << 1);
357       *pOut++ = (q31_t) (acc1 << 1);
358       *pOut++ = (q31_t) (acc2 << 1);
359       *pOut++ = (q31_t) (acc3 << 1);
360 
361       /* Increment the pointer pIn1 index, count by 4 */
362       count += 4U;
363 
364       /* Update the inputA and inputB pointers for next MAC calculation */
365       px = pIn1 + count;
366       py = pSrc2;
367 
368       /* Decrement loop counter */
369       blkCnt--;
370     }
371 
372     /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
373      ** No loop unrolling is used. */
374     blkCnt = blockSize2 % 0x4U;
375 
376     while (blkCnt > 0U)
377     {
378       /* Accumulator is made zero for every iteration */
379       sum = 0;
380 
381       /* Apply loop unrolling and compute 4 MACs simultaneously. */
382       k = srcBLen >> 2U;
383 
384       /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
385        ** a second loop below computes MACs for the remaining 1 to 3 samples. */
386       while (k > 0U)
387       {
388         /* Perform the multiply-accumulates */
389         sum = (q31_t) ((((q63_t) sum << 32) +
390                         ((q63_t) *px++ * (*py--))) >> 32);
391         sum = (q31_t) ((((q63_t) sum << 32) +
392                         ((q63_t) *px++ * (*py--))) >> 32);
393         sum = (q31_t) ((((q63_t) sum << 32) +
394                         ((q63_t) *px++ * (*py--))) >> 32);
395         sum = (q31_t) ((((q63_t) sum << 32) +
396                         ((q63_t) *px++ * (*py--))) >> 32);
397 
398         /* Decrement loop counter */
399         k--;
400       }
401 
402       /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
403        ** No loop unrolling is used. */
404       k = srcBLen % 0x4U;
405 
406       while (k > 0U)
407       {
408         /* Perform the multiply-accumulate */
409         sum = (q31_t) ((((q63_t) sum << 32) +
410                         ((q63_t) *px++ * (*py--))) >> 32);
411 
412         /* Decrement loop counter */
413         k--;
414       }
415 
416       /* Store the result in the accumulator in the destination buffer. */
417       *pOut++ = sum << 1;
418 
419       /* Increment MAC count */
420       count++;
421 
422       /* Update the inputA and inputB pointers for next MAC calculation */
423       px = pIn1 + count;
424       py = pSrc2;
425 
426       /* Decrement loop counter */
427       blkCnt--;
428     }
429   }
430   else
431   {
432     /* If the srcBLen is not a multiple of 4,
433      * the blockSize2 loop cannot be unrolled by 4 */
434     blkCnt = blockSize2;
435 
436     while (blkCnt > 0U)
437     {
438       /* Accumulator is made zero for every iteration */
439       sum = 0;
440 
441       /* srcBLen number of MACS should be performed */
442       k = srcBLen;
443 
444       while (k > 0U)
445       {
446         /* Perform the multiply-accumulate */
447         sum = (q31_t) ((((q63_t) sum << 32) +
448                         ((q63_t) *px++ * (*py--))) >> 32);
449 
450         /* Decrement loop counter */
451         k--;
452       }
453 
454       /* Store the result in the accumulator in the destination buffer. */
455       *pOut++ = sum << 1;
456 
457       /* Increment MAC count */
458       count++;
459 
460       /* Update the inputA and inputB pointers for next MAC calculation */
461       px = pIn1 + count;
462       py = pSrc2;
463 
464       /* Decrement loop counter */
465       blkCnt--;
466     }
467   }
468 
469 
470   /* --------------------------
471    * Initializations of stage3
472    * -------------------------*/
473 
474   /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
475    * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
476    * ....
477    * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
478    * sum +=  x[srcALen-1] * y[srcBLen-1]
479    */
480 
481   /* In this stage the MAC operations are decreased by 1 for every iteration.
482      The blockSize3 variable holds the number of MAC operations performed */
483 
484   /* Working pointer of inputA */
485   pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
486   px = pSrc1;
487 
488   /* Working pointer of inputB */
489   pSrc2 = pIn2 + (srcBLen - 1U);
490   py = pSrc2;
491 
492   /* -------------------
493    * Stage3 process
494    * ------------------*/
495 
496   while (blockSize3 > 0U)
497   {
498     /* Accumulator is made zero for every iteration */
499     sum = 0;
500 
501     /* Apply loop unrolling and compute 4 MACs simultaneously. */
502     k = blockSize3 >> 2U;
503 
504     /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
505      ** a second loop below computes MACs for the remaining 1 to 3 samples. */
506     while (k > 0U)
507     {
508       /* Perform the multiply-accumulate */
509       /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
510       sum = (q31_t) ((((q63_t) sum << 32) +
511                       ((q63_t) *px++ * (*py--))) >> 32);
512 
513       /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
514       sum = (q31_t) ((((q63_t) sum << 32) +
515                       ((q63_t) *px++ * (*py--))) >> 32);
516 
517       /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
518       sum = (q31_t) ((((q63_t) sum << 32) +
519                       ((q63_t) *px++ * (*py--))) >> 32);
520 
521       /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
522       sum = (q31_t) ((((q63_t) sum << 32) +
523                       ((q63_t) *px++ * (*py--))) >> 32);
524 
525       /* Decrement loop counter */
526       k--;
527     }
528 
529     /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
530      ** No loop unrolling is used. */
531     k = blockSize3 % 0x4U;
532 
533     while (k > 0U)
534     {
535       /* Perform the multiply-accumulate */
536       sum = (q31_t) ((((q63_t) sum << 32) +
537                       ((q63_t) *px++ * (*py--))) >> 32);
538 
539       /* Decrement loop counter */
540       k--;
541     }
542 
543     /* Store the result in the accumulator in the destination buffer. */
544     *pOut++ = sum << 1;
545 
546     /* Update the inputA and inputB pointers for next MAC calculation */
547     px = ++pSrc1;
548     py = pSrc2;
549 
550     /* Decrement loop counter */
551     blockSize3--;
552   }
553 
554 }
555 
556 /**
557   @} end of Conv group
558  */
559