1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_correlate_opt_q7.c
4 * Description: Correlation of Q7 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Corr
37 @{
38 */
39
40 /**
41 @brief Correlation of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
47 @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48 @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49 @return none
50
51 @par Scaling and Overflow Behavior
52 The function is implemented using a 32-bit internal accumulator.
53 Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
54 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
55 This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
56 The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
57 */
58
arm_correlate_opt_q7(const q7_t * pSrcA,uint32_t srcALen,const q7_t * pSrcB,uint32_t srcBLen,q7_t * pDst,q15_t * pScratch1,q15_t * pScratch2)59 void arm_correlate_opt_q7(
60 const q7_t * pSrcA,
61 uint32_t srcALen,
62 const q7_t * pSrcB,
63 uint32_t srcBLen,
64 q7_t * pDst,
65 q15_t * pScratch1,
66 q15_t * pScratch2)
67 {
68 q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
69 q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
70 q15_t x4; /* Temporary input variable */
71 q15_t *py; /* Temporary input2 pointer */
72 q31_t acc0, acc1, acc2, acc3; /* Accumulators */
73 const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
74 uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
75 int32_t inc = 1; /* Output pointer increment */
76 uint32_t outBlockSize; /* Loop counter */
77 q31_t x1, x2, x3, y1; /* Temporary input variables */
78 q7_t *pOut = pDst; /* Output pointer */
79
80 /* The algorithm implementation is based on the lengths of the inputs. */
81 /* srcB is always made to slide across srcA. */
82 /* So srcBLen is always considered as shorter or equal to srcALen */
83 /* But CORR(x, y) is reverse of CORR(y, x) */
84 /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
85 /* and the destination pointer modifier, inc is set to -1 */
86 /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
87 /* But to improve the performance,
88 * we include zeroes in the output instead of zero padding either of the the inputs*/
89 /* If srcALen > srcBLen,
90 * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
91 /* If srcALen < srcBLen,
92 * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
93 if (srcALen >= srcBLen)
94 {
95 /* Initialization of inputA pointer */
96 pIn1 = pSrcA;
97
98 /* Initialization of inputB pointer */
99 pIn2 = pSrcB;
100
101 /* Number of output samples is calculated */
102 outBlockSize = (srcALen * 2U) - 1U;
103
104 /* When srcALen > srcBLen, zero padding is done to srcB
105 * to make their lengths equal.
106 * Instead, (outBlockSize - (srcALen + srcBLen - 1))
107 * number of output samples are made zero */
108 j = outBlockSize - (srcALen + (srcBLen - 1U));
109
110 /* Updating the pointer position to non zero value */
111 pOut += j;
112 }
113 else
114 {
115 /* Initialization of inputA pointer */
116 pIn1 = pSrcB;
117
118 /* Initialization of inputB pointer */
119 pIn2 = pSrcA;
120
121 /* srcBLen is always considered as shorter or equal to srcALen */
122 j = srcBLen;
123 srcBLen = srcALen;
124 srcALen = j;
125
126 /* CORR(x, y) = Reverse order(CORR(y, x)) */
127 /* Hence set the destination pointer to point to the last output sample */
128 pOut = pDst + ((srcALen + srcBLen) - 2U);
129
130 /* Destination address modifier is set to -1 */
131 inc = -1;
132 }
133
134
135 /* Copy (srcBLen) samples in scratch buffer */
136 k = srcBLen >> 2U;
137
138 /* First part of the processing with loop unrolling copies 4 data points at a time.
139 a second loop below copies for the remaining 1 to 3 samples. */
140 while (k > 0U)
141 {
142 /* copy second buffer in reversal manner */
143 x4 = (q15_t) *pIn2++;
144 *pScr2++ = x4;
145 x4 = (q15_t) *pIn2++;
146 *pScr2++ = x4;
147 x4 = (q15_t) *pIn2++;
148 *pScr2++ = x4;
149 x4 = (q15_t) *pIn2++;
150 *pScr2++ = x4;
151
152 /* Decrement loop counter */
153 k--;
154 }
155
156 /* If the count is not a multiple of 4, copy remaining samples here.
157 No loop unrolling is used. */
158 k = srcBLen % 0x4U;
159
160 while (k > 0U)
161 {
162 /* copy second buffer in reversal manner for remaining samples */
163 x4 = (q15_t) *pIn2++;
164 *pScr2++ = x4;
165
166 /* Decrement loop counter */
167 k--;
168 }
169
170 /* Fill (srcBLen - 1U) zeros in scratch buffer */
171 arm_fill_q15(0, pScr1, (srcBLen - 1U));
172
173 /* Update temporary scratch pointer */
174 pScr1 += (srcBLen - 1U);
175
176 /* Copy (srcALen) samples in scratch buffer */
177 /* Apply loop unrolling and do 4 Copies simultaneously. */
178 k = srcALen >> 2U;
179
180 /* First part of the processing with loop unrolling copies 4 data points at a time.
181 a second loop below copies for the remaining 1 to 3 samples. */
182 while (k > 0U)
183 {
184 /* copy second buffer in reversal manner */
185 x4 = (q15_t) *pIn1++;
186 *pScr1++ = x4;
187 x4 = (q15_t) *pIn1++;
188 *pScr1++ = x4;
189 x4 = (q15_t) *pIn1++;
190 *pScr1++ = x4;
191 x4 = (q15_t) *pIn1++;
192 *pScr1++ = x4;
193
194 /* Decrement loop counter */
195 k--;
196 }
197
198 /* If the count is not a multiple of 4, copy remaining samples here.
199 No loop unrolling is used. */
200 k = srcALen % 0x4U;
201
202 while (k > 0U)
203 {
204 /* copy second buffer in reversal manner for remaining samples */
205 x4 = (q15_t) * pIn1++;
206 *pScr1++ = x4;
207
208 /* Decrement the loop counter */
209 k--;
210 }
211
212 /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
213 arm_fill_q15(0, pScr1, (srcBLen - 1U));
214
215 /* Update pointer */
216 pScr1 += (srcBLen - 1U);
217
218 /* Temporary pointer for scratch2 */
219 py = pScratch2;
220
221 /* Initialization of pScr2 pointer */
222 pScr2 = pScratch2;
223
224 /* Actual correlation process starts here */
225 blkCnt = (srcALen + srcBLen - 1U) >> 2;
226
227 while (blkCnt > 0)
228 {
229 /* Initialze temporary scratch pointer as scratch1 */
230 pScr1 = pScratch1;
231
232 /* Clear Accumlators */
233 acc0 = 0;
234 acc1 = 0;
235 acc2 = 0;
236 acc3 = 0;
237
238 /* Read two samples from scratch1 buffer */
239 x1 = read_q15x2_ia (&pScr1);
240
241 /* Read next two samples from scratch1 buffer */
242 x2 = read_q15x2_ia (&pScr1);
243
244 tapCnt = (srcBLen) >> 2U;
245
246 while (tapCnt > 0U)
247 {
248 /* Read four samples from smaller buffer */
249 y1 = read_q15x2_ia (&pScr2);
250
251 /* multiply and accumulate */
252 acc0 = __SMLAD(x1, y1, acc0);
253 acc2 = __SMLAD(x2, y1, acc2);
254
255 /* pack input data */
256 #ifndef ARM_MATH_BIG_ENDIAN
257 x3 = __PKHBT(x2, x1, 0);
258 #else
259 x3 = __PKHBT(x1, x2, 0);
260 #endif
261
262 /* multiply and accumulate */
263 acc1 = __SMLADX(x3, y1, acc1);
264
265 /* Read next two samples from scratch1 buffer */
266 x1 = read_q15x2_ia (&pScr1);
267
268 /* pack input data */
269 #ifndef ARM_MATH_BIG_ENDIAN
270 x3 = __PKHBT(x1, x2, 0);
271 #else
272 x3 = __PKHBT(x2, x1, 0);
273 #endif
274
275 acc3 = __SMLADX(x3, y1, acc3);
276
277 /* Read four samples from smaller buffer */
278 y1 = read_q15x2_ia (&pScr2);
279
280 acc0 = __SMLAD(x2, y1, acc0);
281
282 acc2 = __SMLAD(x1, y1, acc2);
283
284 acc1 = __SMLADX(x3, y1, acc1);
285
286 x2 = read_q15x2_ia (&pScr1);
287
288 #ifndef ARM_MATH_BIG_ENDIAN
289 x3 = __PKHBT(x2, x1, 0);
290 #else
291 x3 = __PKHBT(x1, x2, 0);
292 #endif
293
294 acc3 = __SMLADX(x3, y1, acc3);
295
296 /* Decrement loop counter */
297 tapCnt--;
298 }
299
300 /* Update scratch pointer for remaining samples of smaller length sequence */
301 pScr1 -= 4U;
302
303 /* apply same above for remaining samples of smaller length sequence */
304 tapCnt = (srcBLen) & 3U;
305
306 while (tapCnt > 0U)
307 {
308 /* accumulate the results */
309 acc0 += (*pScr1++ * *pScr2);
310 acc1 += (*pScr1++ * *pScr2);
311 acc2 += (*pScr1++ * *pScr2);
312 acc3 += (*pScr1++ * *pScr2++);
313
314 pScr1 -= 3U;
315
316 /* Decrement loop counter */
317 tapCnt--;
318 }
319
320 blkCnt--;
321
322 /* Store the result in the accumulator in the destination buffer. */
323 *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
324 pOut += inc;
325 *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8));
326 pOut += inc;
327 *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8));
328 pOut += inc;
329 *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8));
330 pOut += inc;
331
332 /* Initialization of inputB pointer */
333 pScr2 = py;
334
335 pScratch1 += 4U;
336 }
337
338 blkCnt = (srcALen + srcBLen - 1U) & 0x3;
339
340 /* Calculate correlation for remaining samples of Bigger length sequence */
341 while (blkCnt > 0)
342 {
343 /* Initialze temporary scratch pointer as scratch1 */
344 pScr1 = pScratch1;
345
346 /* Clear Accumlators */
347 acc0 = 0;
348
349 tapCnt = (srcBLen) >> 1U;
350
351 while (tapCnt > 0U)
352 {
353 acc0 += (*pScr1++ * *pScr2++);
354 acc0 += (*pScr1++ * *pScr2++);
355
356 /* Decrement loop counter */
357 tapCnt--;
358 }
359
360 tapCnt = (srcBLen) & 1U;
361
362 /* apply same above for remaining samples of smaller length sequence */
363 while (tapCnt > 0U)
364 {
365 /* accumulate the results */
366 acc0 += (*pScr1++ * *pScr2++);
367
368 /* Decrement loop counter */
369 tapCnt--;
370 }
371
372 blkCnt--;
373
374 /* Store the result in the accumulator in the destination buffer. */
375 *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
376 pOut += inc;
377
378 /* Initialization of inputB pointer */
379 pScr2 = py;
380
381 pScratch1 += 1U;
382 }
383
384 }
385
386 /**
387 @} end of Corr group
388 */
389