/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cfft_q31.c
 * Description:  Combined Radix Decimation in Frequency CFFT fixed point processing function
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/transform_functions.h"



#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)

#include "arm_vec_fft.h"


static void _arm_radix4_butterfly_q31_mve(
    const arm_cfft_instance_q31 * S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
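    /*
     * Byte offsets used by the gather/scatter last stage below: with the
     * +64-byte write-back applied on each gather, the four lanes of A/B/C/D
     * pick complex samples {0,4}/{1,5}/{2,6}/{3,7} of every 8-sample block.
     * (sizeof(q31_t *) == sizeof(q31_t) on these 32-bit targets.)
     */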
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *)
    };


    /*
     * Process the first stages.
     * Each of these stages scales the data down by a factor of 4
     * (each output passes through two halving add/sub operations).
     */
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        q31_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];

        q31_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            q31_t *inA = pBase;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;
            q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            q31x4_t vecW;


            blkCnt = n2 / 2;
            /*
             * load 2 x q31 complex pair
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);

                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0, q31x4_t);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * End of first stages.
     * Data is in 11.21 (q21) format for the 1024 point, as there are 3 middle stages.
     * Data is in 9.23 (q23) format for the 256 point, as there are 2 middle stages.
     * Data is in 7.25 (q25) format for the 64 point, as there is 1 middle stage.
     * Data is in 5.27 (q27) format for the 16 point, as there are no middle stages.
     */

    /*
     * Start of last stage.
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * Output is in 11.21 (q21) format for the 1024 point.
     * Output is in 9.23 (q23) format for the 256 point.
     * Output is in 7.25 (q25) format for the 64 point.
     * Output is in 5.27 (q27) format for the 16 point.
     */
}


static void arm_cfft_radix4by2_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;
    uint32_t blkCnt;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

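    /*
     * fftLen is 2 * 4^n here: run one radix-2 stage (the difference term is
     * rotated by the conjugate twiddles), then two fftLen/2-point radix-4
     * sub-transforms, and finally rescale the result by 2.
     */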
    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;

    blkCnt = n2 / 2;

    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        vecTw = vld1q_s32(pCoef);
        pCoef += 4;
        vecDiff = vhsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw, q31x4_t);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_q31_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }

}

static void _arm_radix4_butterfly_inverse_q31_mve(
    const arm_cfft_instance_q31 *S,
    q31_t *pSrc,
    uint32_t fftLen)
{
    q31x4_t vecTmp0, vecTmp1;
    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    q31x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;
    uint32_t stage = 0;
    int32_t iter = 1;
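    /*
     * Gather/scatter byte offsets for the last radix-4 stage
     * (same layout as in _arm_radix4_butterfly_q31_mve above).
     */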
    static const int32_t strides[4] = {
        (0 - 16) * (int32_t)sizeof(q31_t *), (1 - 16) * (int32_t)sizeof(q31_t *),
        (8 - 16) * (int32_t)sizeof(q31_t *), (9 - 16) * (int32_t)sizeof(q31_t *)
    };

    /*
     * Process the first stages.
     * Each of these stages scales the data down by a factor of 4
     * (each output passes through two halving add/sub operations).
     */
    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;

    for (int k = fftLen / 4u; k > 1; k >>= 2u)
    {
        q31_t const *p_rearranged_twiddle_tab_stride2 =
            &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride3 =
            &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
        q31_t const *p_rearranged_twiddle_tab_stride1 =
            &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];

        q31_t * pBase = pSrc;
        for (int i = 0; i < iter; i++)
        {
            q31_t *inA = pBase;
            q31_t *inB = inA + n2 * CMPLX_DIM;
            q31_t *inC = inB + n2 * CMPLX_DIM;
            q31_t *inD = inC + n2 * CMPLX_DIM;
            q31_t const *pW1 = p_rearranged_twiddle_tab_stride1;
            q31_t const *pW2 = p_rearranged_twiddle_tab_stride2;
            q31_t const *pW3 = p_rearranged_twiddle_tab_stride3;
            q31x4_t vecW;

            blkCnt = n2 / 2;
            /*
             * load 2 x q31 complex pair
             */
            vecA = vldrwq_s32(inA);
            vecC = vldrwq_s32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_s32(inB);
                vecD = vldrwq_s32(inD);

                vecSum0 = vhaddq(vecA, vecC);
                vecDiff0 = vhsubq(vecA, vecC);

                vecSum1 = vhaddq(vecB, vecD);
                vecDiff1 = vhsubq(vecB, vecD);
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vhaddq(vecSum0, vecSum1);
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vhsubq(vecSum0, vecSum1);
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);

                vst1q(inB, vecTmp1);
                inB += 4;
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);
                vst1q(inC, vecTmp1);
                inC += 4;
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW, q31x4_t);
                vst1q(inD, vecTmp1);
                inD += 4;

                vecA = vldrwq_s32(inA);
                vecC = vldrwq_s32(inC);

                blkCnt--;
            }
            pBase += CMPLX_DIM * n1;
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * End of first stages.
     * Data is in 11.21 (q21) format for the 1024 point, as there are 3 middle stages.
     * Data is in 9.23 (q23) format for the 256 point, as there are 2 middle stages.
     * Data is in 7.25 (q25) format for the 64 point, as there is 1 middle stage.
     * Data is in 5.27 (q27) format for the 16 point, as there are no middle stages.
     */

    /*
     * Start of last stage.
     */
    uint32x4_t vecScGathAddr = vld1q_u32((uint32_t*)strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vhaddq(vecA, vecC);
        vecDiff0 = vhsubq(vecA, vecC);

        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);

        vecSum1 = vhaddq(vecB, vecD);
        vecDiff1 = vhsubq(vecB, vecD);
        /*
         * pre-load for next iteration
         */
        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);

        vecTmp0 = vhaddq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vhsubq(vecSum0, vecSum1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }
    /*
     * Output is in 11.21 (q21) format for the 1024 point.
     * Output is in 9.23 (q23) format for the 256 point.
     * Output is in 7.25 (q25) format for the 64 point.
     * Output is in 5.27 (q27) format for the 16 point.
     */
}

static void arm_cfft_radix4by2_inverse_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;
    q31_t *pIn0;
    q31_t *pIn1;
    const q31_t *pCoef = S->pTwiddle;

    uint32_t blkCnt;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

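    /*
     * Radix-2 first stage (inverse): same structure as the forward version,
     * but the difference term is rotated by the twiddles themselves rather
     * than their conjugates.
     */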
    n2 = fftLen >> 1;

    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;
    blkCnt = n2 / 2;

    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        vecTw = vld1q_s32(pCoef);
        pCoef += 4;
        vecDiff = vhsubq(vecIn0, vecIn1);

        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxB(vecDiff, vecTw, q31x4_t);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, n2);

    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc + fftLen, n2);

    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged through tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }

}


/**
  @addtogroup ComplexFFTQ31
  @{
 */
/**
  @brief         Processing function for the Q31 complex FFT.
  @param[in]     S               points to an instance of the fixed-point CFFT structure
  @param[in,out] pSrc            points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
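
  @par           Example
                   Minimal usage sketch (illustrative only; the buffer name is
                   arbitrary and the arm_cfft_init_q31() helper is assumed to be
                   available in this library version):
  @code
      #define FFT_LEN 256
      q31_t buffer[2 * FFT_LEN];    // interleaved real/imag Q31 samples, transformed in-place
      arm_cfft_instance_q31 cfft;

      if (arm_cfft_init_q31(&cfft, FFT_LEN) == ARM_MATH_SUCCESS)
      {
          // forward transform with output bit reversal enabled,
          // so the result is returned in natural frequency order
          arm_cfft_q31(&cfft, buffer, 0, 1);
      }
  @endcode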
 */
void arm_cfft_q31(
  const arm_cfft_instance_q31 * S,
        q31_t * pSrc,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
            break;
        }
    }

    if (bitReverseFlag)
    {
        arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
    }
}

/**
  @} end of ComplexFFTQ31 group
 */
#else

extern void arm_radix4_butterfly_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_radix4_butterfly_inverse_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_bitreversal_32(
        uint32_t * pSrc,
  const uint16_t bitRevLen,
  const uint16_t * pBitRevTable);

void arm_cfft_radix4by2_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef);

void arm_cfft_radix4by2_inverse_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef);


/**
  @addtogroup ComplexFFTQ31
  @{
 */

/**
  @brief         Processing function for the Q31 complex FFT.
  @param[in]     S               points to an instance of the fixed-point CFFT structure
  @param[in,out] p1              points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
  @param[in]     ifftFlag        flag that selects transform direction
                   - value = 0: forward transform
                   - value = 1: inverse transform
  @param[in]     bitReverseFlag  flag that enables / disables bit reversal of output
                   - value = 0: disables bit reversal of output
                   - value = 1: enables bit reversal of output
  @return        none
 */
void arm_cfft_q31(
  const arm_cfft_instance_q31 * S,
        q31_t * p1,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
    uint32_t L = S->fftLen;

    if (ifftFlag == 1U)
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }
    else
    {
        switch (L)
        {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
            break;
        }
    }

    if ( bitReverseFlag )
        arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
}

/**
  @} end of ComplexFFTQ31 group
 */

void arm_cfft_radix4by2_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

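    /*
     * Radix-2 first stage: inputs are pre-scaled by 1/4, and the difference
     * term is rotated by the conjugate twiddle (cosVal - i*sinVal) below.
     */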
    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        l = i + n2;

        xt =          (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt =              (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

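        /* complex multiply: (xt + i*yt) * (cosVal - i*sinVal), keeping the high 32 bits */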
        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multAcc_32x32_keep32_R(p0, yt, sinVal);
        multSub_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l]     = p0 << 1;
        pSrc[2 * l + 1] = p1 << 1;
    }


    /* first col */
    arm_radix4_butterfly_q31 (pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }

}

void arm_cfft_radix4by2_inverse_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        l = i + n2;

        xt =          (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt =              (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

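        /* complex multiply: (xt + i*yt) * (cosVal + i*sinVal), keeping the high 32 bits */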
        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multSub_32x32_keep32_R(p0, yt, sinVal);
        multAcc_32x32_keep32_R(p1, xt, sinVal);

        pSrc[2 * l]     = p0 << 1U;
        pSrc[2 * l + 1] = p1 << 1U;
    }

    /* first col */
    arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col */
    arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}
#endif /* defined(ARM_MATH_MVEI) */
