• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_correlate_opt_q7.c
4  * Description:  Correlation of Q7 sequences
5  *
6  * $Date:        23 April 2021
7  * $Revision:    V1.9.0
8  *
9  * Target Processor: Cortex-M and Cortex-A cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "dsp/filtering_functions.h"
30 
31 /**
32   @ingroup groupFilters
33  */
34 
35 /**
36   @addtogroup Corr
37   @{
38  */
39 
40 /**
41   @brief         Correlation of Q7 sequences.
42   @param[in]     pSrcA      points to the first input sequence
43   @param[in]     srcALen    length of the first input sequence
44   @param[in]     pSrcB      points to the second input sequence
45   @param[in]     srcBLen    length of the second input sequence
46   @param[out]    pDst       points to the location where the output result is written.  Length 2 * max(srcALen, srcBLen) - 1.
47   @param[in]     pScratch1  points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48   @param[in]     pScratch2  points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49   @return        none
50 
51   @par           Scaling and Overflow Behavior
52                    The function is implemented using a 32-bit internal accumulator.
53                    Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
54                    The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
55                    This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
56                    The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
57  */
58 
arm_correlate_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,q15_t * pScratch1,q15_t * pScratch2)59 void arm_correlate_opt_q7(
60   const q7_t * pSrcA,
61         uint32_t srcALen,
62   const q7_t * pSrcB,
63         uint32_t srcBLen,
64         q7_t * pDst,
65         q15_t * pScratch1,
66         q15_t * pScratch2)
67 {
68         q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch */
69         q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch */
70         q15_t x4;                                      /* Temporary input variable */
71         q15_t *py;                                     /* Temporary input2 pointer */
72         q31_t acc0, acc1, acc2, acc3;                  /* Accumulators */
73   const q7_t *pIn1, *pIn2;                             /* InputA and inputB pointer */
74         uint32_t j, k, blkCnt, tapCnt;                 /* Loop counter */
75         int32_t inc = 1;                               /* Output pointer increment */
76         uint32_t outBlockSize;                         /* Loop counter */
77         q31_t x1, x2, x3, y1;                          /* Temporary input variables */
78         q7_t *pOut = pDst;                             /* Output pointer */
79 
80   /* The algorithm implementation is based on the lengths of the inputs. */
81   /* srcB is always made to slide across srcA. */
82   /* So srcBLen is always considered as shorter or equal to srcALen */
83   /* But CORR(x, y) is reverse of CORR(y, x) */
84   /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
85   /* and the destination pointer modifier, inc is set to -1 */
86   /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
87   /* But to improve the performance,
88    * we include zeroes in the output instead of zero padding either of the the inputs*/
89   /* If srcALen > srcBLen,
90    * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
91   /* If srcALen < srcBLen,
92    * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
93   if (srcALen >= srcBLen)
94   {
95     /* Initialization of inputA pointer */
96     pIn1 = pSrcA;
97 
98     /* Initialization of inputB pointer */
99     pIn2 = pSrcB;
100 
101     /* Number of output samples is calculated */
102     outBlockSize = (srcALen * 2U) - 1U;
103 
104     /* When srcALen > srcBLen, zero padding is done to srcB
105      * to make their lengths equal.
106      * Instead, (outBlockSize - (srcALen + srcBLen - 1))
107      * number of output samples are made zero */
108     j = outBlockSize - (srcALen + (srcBLen - 1U));
109 
110     /* Updating the pointer position to non zero value */
111     pOut += j;
112   }
113   else
114   {
115     /* Initialization of inputA pointer */
116     pIn1 = pSrcB;
117 
118     /* Initialization of inputB pointer */
119     pIn2 = pSrcA;
120 
121     /* srcBLen is always considered as shorter or equal to srcALen */
122     j = srcBLen;
123     srcBLen = srcALen;
124     srcALen = j;
125 
126     /* CORR(x, y) = Reverse order(CORR(y, x)) */
127     /* Hence set the destination pointer to point to the last output sample */
128     pOut = pDst + ((srcALen + srcBLen) - 2U);
129 
130     /* Destination address modifier is set to -1 */
131     inc = -1;
132   }
133 
134 
135   /* Copy (srcBLen) samples in scratch buffer */
136   k = srcBLen >> 2U;
137 
138   /* First part of the processing with loop unrolling copies 4 data points at a time.
139      a second loop below copies for the remaining 1 to 3 samples. */
140   while (k > 0U)
141   {
142     /* copy second buffer in reversal manner */
143     x4 = (q15_t) *pIn2++;
144     *pScr2++ = x4;
145     x4 = (q15_t) *pIn2++;
146     *pScr2++ = x4;
147     x4 = (q15_t) *pIn2++;
148     *pScr2++ = x4;
149     x4 = (q15_t) *pIn2++;
150     *pScr2++ = x4;
151 
152     /* Decrement loop counter */
153     k--;
154   }
155 
156   /* If the count is not a multiple of 4, copy remaining samples here.
157      No loop unrolling is used. */
158   k = srcBLen % 0x4U;
159 
160   while (k > 0U)
161   {
162     /* copy second buffer in reversal manner for remaining samples */
163     x4 = (q15_t) *pIn2++;
164     *pScr2++ = x4;
165 
166     /* Decrement loop counter */
167     k--;
168   }
169 
170   /* Fill (srcBLen - 1U) zeros in scratch buffer */
171   arm_fill_q15(0, pScr1, (srcBLen - 1U));
172 
173   /* Update temporary scratch pointer */
174   pScr1 += (srcBLen - 1U);
175 
176   /* Copy (srcALen) samples in scratch buffer */
177   /* Apply loop unrolling and do 4 Copies simultaneously. */
178   k = srcALen >> 2U;
179 
180   /* First part of the processing with loop unrolling copies 4 data points at a time.
181      a second loop below copies for the remaining 1 to 3 samples. */
182   while (k > 0U)
183   {
184     /* copy second buffer in reversal manner */
185     x4 = (q15_t) *pIn1++;
186     *pScr1++ = x4;
187     x4 = (q15_t) *pIn1++;
188     *pScr1++ = x4;
189     x4 = (q15_t) *pIn1++;
190     *pScr1++ = x4;
191     x4 = (q15_t) *pIn1++;
192     *pScr1++ = x4;
193 
194     /* Decrement loop counter */
195     k--;
196   }
197 
198   /* If the count is not a multiple of 4, copy remaining samples here.
199      No loop unrolling is used. */
200   k = srcALen % 0x4U;
201 
202   while (k > 0U)
203   {
204     /* copy second buffer in reversal manner for remaining samples */
205     x4 = (q15_t) * pIn1++;
206     *pScr1++ = x4;
207 
208     /* Decrement the loop counter */
209     k--;
210   }
211 
212   /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
213   arm_fill_q15(0, pScr1, (srcBLen - 1U));
214 
215   /* Update pointer */
216   pScr1 += (srcBLen - 1U);
217 
218   /* Temporary pointer for scratch2 */
219   py = pScratch2;
220 
221   /* Initialization of pScr2 pointer */
222   pScr2 = pScratch2;
223 
224   /* Actual correlation process starts here */
225   blkCnt = (srcALen + srcBLen - 1U) >> 2;
226 
227   while (blkCnt > 0)
228   {
229     /* Initialze temporary scratch pointer as scratch1 */
230     pScr1 = pScratch1;
231 
232     /* Clear Accumlators */
233     acc0 = 0;
234     acc1 = 0;
235     acc2 = 0;
236     acc3 = 0;
237 
238     /* Read two samples from scratch1 buffer */
239     x1 = read_q15x2_ia (&pScr1);
240 
241     /* Read next two samples from scratch1 buffer */
242     x2 = read_q15x2_ia (&pScr1);
243 
244     tapCnt = (srcBLen) >> 2U;
245 
246     while (tapCnt > 0U)
247     {
248       /* Read four samples from smaller buffer */
249       y1 = read_q15x2_ia (&pScr2);
250 
251       /* multiply and accumulate */
252       acc0 = __SMLAD(x1, y1, acc0);
253       acc2 = __SMLAD(x2, y1, acc2);
254 
255       /* pack input data */
256 #ifndef ARM_MATH_BIG_ENDIAN
257       x3 = __PKHBT(x2, x1, 0);
258 #else
259       x3 = __PKHBT(x1, x2, 0);
260 #endif
261 
262       /* multiply and accumulate */
263       acc1 = __SMLADX(x3, y1, acc1);
264 
265       /* Read next two samples from scratch1 buffer */
266       x1 = read_q15x2_ia (&pScr1);
267 
268       /* pack input data */
269 #ifndef ARM_MATH_BIG_ENDIAN
270       x3 = __PKHBT(x1, x2, 0);
271 #else
272       x3 = __PKHBT(x2, x1, 0);
273 #endif
274 
275       acc3 = __SMLADX(x3, y1, acc3);
276 
277       /* Read four samples from smaller buffer */
278       y1 = read_q15x2_ia (&pScr2);
279 
280       acc0 = __SMLAD(x2, y1, acc0);
281 
282       acc2 = __SMLAD(x1, y1, acc2);
283 
284       acc1 = __SMLADX(x3, y1, acc1);
285 
286       x2 = read_q15x2_ia (&pScr1);
287 
288 #ifndef ARM_MATH_BIG_ENDIAN
289       x3 = __PKHBT(x2, x1, 0);
290 #else
291       x3 = __PKHBT(x1, x2, 0);
292 #endif
293 
294       acc3 = __SMLADX(x3, y1, acc3);
295 
296       /* Decrement loop counter */
297       tapCnt--;
298     }
299 
300     /* Update scratch pointer for remaining samples of smaller length sequence */
301     pScr1 -= 4U;
302 
303     /* apply same above for remaining samples of smaller length sequence */
304     tapCnt = (srcBLen) & 3U;
305 
306     while (tapCnt > 0U)
307     {
308       /* accumulate the results */
309       acc0 += (*pScr1++ * *pScr2);
310       acc1 += (*pScr1++ * *pScr2);
311       acc2 += (*pScr1++ * *pScr2);
312       acc3 += (*pScr1++ * *pScr2++);
313 
314       pScr1 -= 3U;
315 
316       /* Decrement loop counter */
317       tapCnt--;
318     }
319 
320     blkCnt--;
321 
322     /* Store the result in the accumulator in the destination buffer. */
323     *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
324     pOut += inc;
325     *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8));
326     pOut += inc;
327     *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8));
328     pOut += inc;
329     *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8));
330     pOut += inc;
331 
332     /* Initialization of inputB pointer */
333     pScr2 = py;
334 
335     pScratch1 += 4U;
336   }
337 
338   blkCnt = (srcALen + srcBLen - 1U) & 0x3;
339 
340   /* Calculate correlation for remaining samples of Bigger length sequence */
341   while (blkCnt > 0)
342   {
343     /* Initialze temporary scratch pointer as scratch1 */
344     pScr1 = pScratch1;
345 
346     /* Clear Accumlators */
347     acc0 = 0;
348 
349     tapCnt = (srcBLen) >> 1U;
350 
351     while (tapCnt > 0U)
352     {
353       acc0 += (*pScr1++ * *pScr2++);
354       acc0 += (*pScr1++ * *pScr2++);
355 
356       /* Decrement loop counter */
357       tapCnt--;
358     }
359 
360     tapCnt = (srcBLen) & 1U;
361 
362     /* apply same above for remaining samples of smaller length sequence */
363     while (tapCnt > 0U)
364     {
365       /* accumulate the results */
366       acc0 += (*pScr1++ * *pScr2++);
367 
368       /* Decrement loop counter */
369       tapCnt--;
370     }
371 
372     blkCnt--;
373 
374     /* Store the result in the accumulator in the destination buffer. */
375     *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
376     pOut += inc;
377 
378     /* Initialization of inputB pointer */
379     pScr2 = py;
380 
381     pScratch1 += 1U;
382   }
383 
384 }
385 
386 /**
387   @} end of Corr group
388  */
389