1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_conv_opt_q7.c
4 * Description: Convolution of Q7 sequences
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/filtering_functions.h"
30
31 /**
32 @ingroup groupFilters
33 */
34
35 /**
36 @addtogroup Conv
37 @{
38 */
39
40 /**
41 @brief Convolution of Q7 sequences.
42 @param[in] pSrcA points to the first input sequence
43 @param[in] srcALen length of the first input sequence
44 @param[in] pSrcB points to the second input sequence
45 @param[in] srcBLen length of the second input sequence
46 @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
47 @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
48 @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
49 @return none
50
51 @par Scaling and Overflow Behavior
52 The function is implemented using a 32-bit internal accumulator.
53 Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
54 The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
55 This approach provides 17 guard bits and there is no risk of overflow as long as <code>max(srcALen, srcBLen)<131072</code>.
56 The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
57 */
58
void arm_conv_opt_q7(
  const q7_t * pSrcA,
        uint32_t srcALen,
  const q7_t * pSrcB,
        uint32_t srcBLen,
        q7_t * pDst,
        q15_t * pScratch1,
        q15_t * pScratch2)
{
  /*
   * Implementation outline
   *
   *  1. The inputs are ordered so that (pIn1, srcALen) is the longer sequence
   *     and (pIn2, srcBLen) the shorter one.
   *  2. The shorter sequence is widened to q15_t and written to pScratch2 in
   *     reversed order.
   *  3. The longer sequence is widened to q15_t and written to pScratch1 with
   *     (srcBLen - 1) zeros before it and (srcBLen - 1) zeros after it.
   *  4. Every output sample is the dot product of the reversed shorter
   *     sequence with a srcBLen-wide window of pScratch1; the window start
   *     advances by one element per output.  Outputs are produced four at a
   *     time, then one at a time for the remaining (srcALen + srcBLen - 1) % 4.
   *
   * If either length is zero the function returns without touching pDst or
   * the scratch buffers.  Without this guard, (srcBLen - 1U) wraps to
   * 0xFFFFFFFF and is passed to arm_fill_q15 as the element count.
   */
  q15_t *pScr1 = pScratch1;                      /* Write/read cursor in scratch1 */
  q15_t *pScr2 = pScratch2;                      /* Write/read cursor in scratch2 */
  q15_t *py;                                     /* Start of the reversed shorter sequence */
  q31_t acc0, acc1, acc2, acc3;                  /* Accumulators for four consecutive outputs */
  const q7_t *pIn1, *pIn2;                       /* Longer / shorter input sequence */
  uint32_t j, k, blkCnt, tapCnt;                 /* Loop counters */
  q31_t x1, x2, x3, y1;                          /* Packed pairs of q15 samples */
  const q7_t *px;                                /* Read cursor in the shorter input */
  q7_t *pOut = pDst;                             /* Output pointer */
  q7_t out0, out1, out2, out3;                   /* Saturated results of one block */

  /* Nothing to convolve; also protects the (srcBLen - 1U) expressions below. */
  if ((srcALen == 0U) || (srcBLen == 0U))
  {
    return;
  }

  /* The shorter sequence always slides across the longer one, so from here on
   * srcBLen <= srcALen holds. */
  if (srcALen >= srcBLen)
  {
    pIn1 = pSrcA;
    pIn2 = pSrcB;
  }
  else
  {
    pIn1 = pSrcB;
    pIn2 = pSrcA;

    /* Swap the lengths together with the pointers */
    j = srcBLen;
    srcBLen = srcALen;
    srcALen = j;
  }

  /* Start at the last sample of the shorter sequence */
  px = pIn2 + srcBLen - 1;

  /* Copy the shorter sequence into scratch2 in reversed order, widening each
   * sample to q15_t.  Four samples per iteration. */
  k = srcBLen >> 2U;

  while (k > 0U)
  {
    *pScr2++ = (q15_t) *px--;
    *pScr2++ = (q15_t) *px--;
    *pScr2++ = (q15_t) *px--;
    *pScr2++ = (q15_t) *px--;

    /* Decrement loop counter */
    k--;
  }

  /* Remaining 0 to 3 samples of the shorter sequence */
  k = srcBLen % 0x4U;

  while (k > 0U)
  {
    *pScr2++ = (q15_t) *px--;

    /* Decrement loop counter */
    k--;
  }

  /* Leading padding: (srcBLen - 1U) zeros at the start of scratch1 */
  arm_fill_q15(0, pScr1, (srcBLen - 1U));

  /* Skip over the leading padding */
  pScr1 += (srcBLen - 1U);

  /* Copy the longer sequence into scratch1 in forward order, widening each
   * sample to q15_t.  Four samples per iteration. */
  k = srcALen >> 2U;

  while (k > 0U)
  {
    *pScr1++ = (q15_t) *pIn1++;
    *pScr1++ = (q15_t) *pIn1++;
    *pScr1++ = (q15_t) *pIn1++;
    *pScr1++ = (q15_t) *pIn1++;

    /* Decrement loop counter */
    k--;
  }

  /* Remaining 0 to 3 samples of the longer sequence */
  k = srcALen % 0x4U;

  while (k > 0U)
  {
    *pScr1++ = (q15_t) *pIn1++;

    /* Decrement the loop counter */
    k--;
  }

  /* Trailing padding: (srcBLen - 1U) zeros after the copied samples */
  arm_fill_q15(0, pScr1, (srcBLen - 1U));

  /* Update pointer */
  pScr1 += (srcBLen - 1U);

  /* Remember the start of the reversed shorter sequence; pScr2 is rewound to
   * it after every output block. */
  py = pScratch2;

  pScr2 = py;

  /* Actual convolution process starts here: blocks of four outputs */
  blkCnt = (srcALen + srcBLen - 1U) >> 2U;

  while (blkCnt > 0)
  {
    /* pScratch1 is the window start for this block (advanced by 4 per block) */
    pScr1 = pScratch1;

    /* Clear accumulators */
    acc0 = 0;
    acc1 = 0;
    acc2 = 0;
    acc3 = 0;

    /* Read two samples from scratch1 buffer */
    x1 = read_q15x2_ia (&pScr1);

    /* Read next two samples from scratch1 buffer */
    x2 = read_q15x2_ia (&pScr1);

    /* Four taps of the shorter sequence per iteration */
    tapCnt = (srcBLen) >> 2U;

    /* NOTE(review): when srcBLen is a multiple of 4, the last read_q15x2_ia of
     * the final iteration appears to fetch one q15_t sample that no
     * accumulator uses; for the last output block this may lie one element
     * past the documented pScratch1 size - confirm against the helper's
     * definition and the buffer sizing. */
    while (tapCnt > 0U)
    {
      /* Read two samples (one packed word) from the reversed shorter sequence */
      y1 = read_q15x2_ia (&pScr2);

      /* multiply and accumulate */
      acc0 = __SMLAD(x1, y1, acc0);
      acc2 = __SMLAD(x2, y1, acc2);

      /* pack input data: combine one half of x1 with one half of x2 to form
       * the sample pair that is offset by one element (used for acc1) */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      /* multiply and accumulate */
      acc1 = __SMLADX(x3, y1, acc1);

      /* Read next two samples from scratch1 buffer */
      x1 = read_q15x2_ia (&pScr1);

      /* pack input data: pair offset by one element from x2 (used for acc3) */
#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x1, x2, 0);
#else
      x3 = __PKHBT(x2, x1, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);

      /* Read the next two samples (one packed word) from the reversed shorter sequence */
      y1 = read_q15x2_ia (&pScr2);

      acc0 = __SMLAD(x2, y1, acc0);

      acc2 = __SMLAD(x1, y1, acc2);

      acc1 = __SMLADX(x3, y1, acc1);

      x2 = read_q15x2_ia (&pScr1);

#ifndef ARM_MATH_BIG_ENDIAN
      x3 = __PKHBT(x2, x1, 0);
#else
      x3 = __PKHBT(x1, x2, 0);
#endif

      acc3 = __SMLADX(x3, y1, acc3);

      /* Decrement loop counter */
      tapCnt--;
    }

    /* Step back over the four samples that were read ahead, so pScr1 points
     * at the first sample needed by the remaining taps */
    pScr1 -= 4U;

    /* Remaining 0 to 3 taps of the shorter sequence, one tap at a time */
    tapCnt = (srcBLen) & 3U;

    while (tapCnt > 0U)
    {
      /* accumulate the results: the same tap applies to four consecutive samples */
      acc0 += (*pScr1++ * *pScr2);
      acc1 += (*pScr1++ * *pScr2);
      acc2 += (*pScr1++ * *pScr2);
      acc3 += (*pScr1++ * *pScr2++);

      /* Net advance of one sample per tap */
      pScr1 -= 3U;

      /* Decrement loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Drop the low 7 bits and saturate each accumulator to 8 bits */
    out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
    out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
    out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
    out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));

    /* Store the four results in the destination buffer */
    write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));

    /* Rewind to the start of the reversed shorter sequence */
    pScr2 = py;

    /* Advance the window start by the four outputs just produced */
    pScratch1 += 4U;
  }

  /* Remaining 0 to 3 output samples, computed one at a time */
  blkCnt = (srcALen + srcBLen - 1U) & 0x3;

  while (blkCnt > 0)
  {
    /* pScratch1 is the window start for this output */
    pScr1 = pScratch1;

    /* Clear accumulator */
    acc0 = 0;

    /* Two taps per iteration */
    tapCnt = (srcBLen) >> 1U;

    while (tapCnt > 0U)
    {
      acc0 += (*pScr1++ * *pScr2++);
      acc0 += (*pScr1++ * *pScr2++);

      /* Decrement loop counter */
      tapCnt--;
    }

    /* Last tap when srcBLen is odd */
    tapCnt = (srcBLen) & 1U;

    while (tapCnt > 0U)
    {
      /* accumulate the results */
      acc0 += (*pScr1++ * *pScr2++);

      /* Decrement loop counter */
      tapCnt--;
    }

    blkCnt--;

    /* Drop the low 7 bits, saturate to 8 bits and store the result */
    *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));

    /* Rewind to the start of the reversed shorter sequence */
    pScr2 = py;

    /* Advance the window start by one output */
    pScratch1 += 1U;
  }

}
357
358 /**
359 @} end of Conv group
360 */
361