• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_conv_opt_q7.c
4  * Description:  Convolution of Q7 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Conv
37   @{
38  */
39 
40 /**
41   @brief         Convolution of Q7 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length srcALen+srcBLen-1.
47   @param[in]     pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48   @param[in]     pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49   @return        none
50 
51   @par           Scaling and Overflow Behavior
52                    The function is implemented using a 32-bit internal accumulator.
53                    Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
54                    The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
55                    This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
56                    The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
57  */
58 
arm_conv_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,q15_t * pScratch1,q15_t * pScratch2)59 void arm_conv_opt_q7(
60   const q7_t * pSrcA,
61         uint32_t srcALen,
62   const q7_t * pSrcB,
63         uint32_t srcBLen,
64         q7_t * pDst,
65         q15_t * pScratch1,
66         q15_t * pScratch2)
67 {
68         q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch */
69         q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch */
70         q15_t x4;                                      /* Temporary input variable */
71         q15_t *py;                                     /* Temporary input2 pointer */
72         q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
73   const q7_t *pIn1, *pIn2;                             /* InputA and inputB pointer */
74         uint32_t j, k, blkCnt, tapCnt;                 /* Loop counter */
75         q31_t x1, x2, x3, y1;                          /* Temporary input variables */
76   const q7_t *px;                                      /* Temporary input1 pointer */
77         q7_t *pOut = pDst;                             /* Output pointer */
78         q7_t out0, out1, out2, out3;                   /* Temporary variables */
79 
80   /* The algorithm implementation is based on the lengths of the inputs. */
81   /* srcB is always made to slide across srcA. */
82   /* So srcBLen is always considered as shorter or equal to srcALen */
83   if (srcALen >= srcBLen)
84   {
85     /* Initialization of inputA pointer */
86     pIn1 = pSrcA;
87 
88     /* Initialization of inputB pointer */
89     pIn2 = pSrcB;
90   }
91   else
92   {
93     /* Initialization of inputA pointer */
94     pIn1 = pSrcB;
95 
96     /* Initialization of inputB pointer */
97     pIn2 = pSrcA;
98 
99     /* srcBLen is always considered as shorter or equal to srcALen */
100     j = srcBLen;
101     srcBLen = srcALen;
102     srcALen = j;
103   }
104 
105   /* points to smaller length sequence */
106   px = pIn2 + srcBLen - 1;
107 
108   /* Apply loop unrolling and do 4 Copies simultaneously. */
109   k = srcBLen >> 2U;
110 
111   /* First part of the processing with loop unrolling copies 4 data points at a time.
112    ** a second loop below copies for the remaining 1 to 3 samples. */
113   while (k > 0U)
114   {
115     /* copy second buffer in reversal manner */
116     x4 = (q15_t) *px--;
117     *pScr2++ = x4;
118     x4 = (q15_t) *px--;
119     *pScr2++ = x4;
120     x4 = (q15_t) *px--;
121     *pScr2++ = x4;
122     x4 = (q15_t) *px--;
123     *pScr2++ = x4;
124 
125     /* Decrement loop counter */
126     k--;
127   }
128 
129   /* If the count is not a multiple of 4, copy remaining samples here.
130    ** No loop unrolling is used. */
131   k = srcBLen % 0x4U;
132 
133   while (k > 0U)
134   {
135     /* copy second buffer in reversal manner for remaining samples */
136     x4 = (q15_t) *px--;
137     *pScr2++ = x4;
138 
139     /* Decrement loop counter */
140     k--;
141   }
142 
143   /* Fill (srcBLen - 1U) zeros in scratch buffer */
144   arm_fill_q15(0, pScr1, (srcBLen - 1U));
145 
146   /* Update temporary scratch pointer */
147   pScr1 += (srcBLen - 1U);
148 
149   /* Copy (srcALen) samples in scratch buffer */
150   /* Apply loop unrolling and do 4 Copies simultaneously. */
151   k = srcALen >> 2U;
152 
153   /* First part of the processing with loop unrolling copies 4 data points at a time.
154    ** a second loop below copies for the remaining 1 to 3 samples. */
155   while (k > 0U)
156   {
157     /* copy second buffer in reversal manner */
158     x4 = (q15_t) *pIn1++;
159     *pScr1++ = x4;
160     x4 = (q15_t) *pIn1++;
161     *pScr1++ = x4;
162     x4 = (q15_t) *pIn1++;
163     *pScr1++ = x4;
164     x4 = (q15_t) *pIn1++;
165     *pScr1++ = x4;
166 
167     /* Decrement loop counter */
168     k--;
169   }
170 
171   /* If the count is not a multiple of 4, copy remaining samples here.
172    ** No loop unrolling is used. */
173   k = srcALen % 0x4U;
174 
175   while (k > 0U)
176   {
177     /* copy second buffer in reversal manner for remaining samples */
178     x4 = (q15_t) * pIn1++;
179     *pScr1++ = x4;
180 
181     /* Decrement the loop counter */
182     k--;
183   }
184 
185   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
186   arm_fill_q15(0, pScr1, (srcBLen - 1U));
187 
188   /* Update pointer */
189   pScr1 += (srcBLen - 1U);
190 
191   /* Temporary pointer for scratch2 */
192   py = pScratch2;
193 
194   /* Initialization of pIn2 pointer */
195   pIn2 = (q7_t *) py;
196 
197   pScr2 = py;
198 
199   /* Actual convolution process starts here */
200   blkCnt = (srcALen + srcBLen - 1U) >> 2U;
201 
202   while (blkCnt > 0)
203   {
204     /* Initialze temporary scratch pointer as scratch1 */
205     pScr1 = pScratch1;
206 
207     /* Clear Accumlators */
208     acc0 = 0;
209     acc1 = 0;
210     acc2 = 0;
211     acc3 = 0;
212 
213     /* Read two samples from scratch1 buffer */
214     x1 = read_q15x2_ia (&pScr1);
215 
216     /* Read next two samples from scratch1 buffer */
217     x2 = read_q15x2_ia (&pScr1);
218 
219     tapCnt = (srcBLen) >> 2U;
220 
221     while (tapCnt > 0U)
222     {
223       /* Read four samples from smaller buffer */
224       y1 = read_q15x2_ia (&pScr2);
225 
226       /* multiply and accumulate */
227       acc0 = __SMLAD(x1, y1, acc0);
228       acc2 = __SMLAD(x2, y1, acc2);
229 
230       /* pack input data */
231 #ifndef ARM_MATH_BIG_ENDIAN
232       x3 = __PKHBT(x2, x1, 0);
233 #else
234       x3 = __PKHBT(x1, x2, 0);
235 #endif
236 
237       /* multiply and accumulate */
238       acc1 = __SMLADX(x3, y1, acc1);
239 
240       /* Read next two samples from scratch1 buffer */
241       x1 = read_q15x2_ia (&pScr1);
242 
243       /* pack input data */
244 #ifndef ARM_MATH_BIG_ENDIAN
245       x3 = __PKHBT(x1, x2, 0);
246 #else
247       x3 = __PKHBT(x2, x1, 0);
248 #endif
249 
250       acc3 = __SMLADX(x3, y1, acc3);
251 
252       /* Read four samples from smaller buffer */
253       y1 = read_q15x2_ia (&pScr2);
254 
255       acc0 = __SMLAD(x2, y1, acc0);
256 
257       acc2 = __SMLAD(x1, y1, acc2);
258 
259       acc1 = __SMLADX(x3, y1, acc1);
260 
261       x2 = read_q15x2_ia (&pScr1);
262 
263 #ifndef ARM_MATH_BIG_ENDIAN
264       x3 = __PKHBT(x2, x1, 0);
265 #else
266       x3 = __PKHBT(x1, x2, 0);
267 #endif
268 
269       acc3 = __SMLADX(x3, y1, acc3);
270 
271       /* Decrement loop counter */
272       tapCnt--;
273     }
274 
275     /* Update scratch pointer for remaining samples of smaller length sequence */
276     pScr1 -= 4U;
277 
278     /* apply same above for remaining samples of smaller length sequence */
279     tapCnt = (srcBLen) & 3U;
280 
281     while (tapCnt > 0U)
282     {
283       /* accumulate the results */
284       acc0 += (*pScr1++ * *pScr2);
285       acc1 += (*pScr1++ * *pScr2);
286       acc2 += (*pScr1++ * *pScr2);
287       acc3 += (*pScr1++ * *pScr2++);
288 
289       pScr1 -= 3U;
290 
291       /* Decrement loop counter */
292       tapCnt--;
293     }
294 
295     blkCnt--;
296 
297     /* Store the result in the accumulator in the destination buffer. */
298     out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
299     out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
300     out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
301     out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
302 
303     write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
304 
305     /* Initialization of inputB pointer */
306     pScr2 = py;
307 
308     pScratch1 += 4U;
309   }
310 
311   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
312 
313   /* Calculate convolution for remaining samples of Bigger length sequence */
314   while (blkCnt > 0)
315   {
316     /* Initialze temporary scratch pointer as scratch1 */
317     pScr1 = pScratch1;
318 
319     /* Clear Accumlators */
320     acc0 = 0;
321 
322     tapCnt = (srcBLen) >> 1U;
323 
324     while (tapCnt > 0U)
325     {
326       acc0 += (*pScr1++ * *pScr2++);
327       acc0 += (*pScr1++ * *pScr2++);
328 
329       /* Decrement loop counter */
330       tapCnt--;
331     }
332 
333     tapCnt = (srcBLen) & 1U;
334 
335     /* apply same above for remaining samples of smaller length sequence */
336     while (tapCnt > 0U)
337     {
338       /* accumulate the results */
339       acc0 += (*pScr1++ * *pScr2++);
340 
341       /* Decrement loop counter */
342       tapCnt--;
343     }
344 
345     blkCnt--;
346 
347     /* Store the result in the accumulator in the destination buffer. */
348     *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
349 
350     /* Initialization of inputB pointer */
351     pScr2 = py;
352 
353     pScratch1 += 1U;
354   }
355 
356 }
357 
358 /**
359   @} end of Conv group
360  */
361