1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_cfft_f32.c
4 * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
5 *
6 * $Date: 18. March 2019
7 * $Revision: V1.6.0
8 *
9 * Target Processor: Cortex-M cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "arm_math.h"
30 #include "arm_common_tables.h"
31
32 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
33
34 #include "arm_helium_utils.h"
35 #include "arm_vec_fft.h"
36 #include "arm_mve_tables.h"
37
38
arm_inverse_fft_length_f32(uint16_t fftLen)39 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
40 {
41 float32_t retValue=1.0;
42
43 switch (fftLen)
44 {
45
46 case 4096U:
47 retValue = 0.000244140625;
48 break;
49
50 case 2048U:
51 retValue = 0.00048828125;
52 break;
53
54 case 1024U:
55 retValue = 0.0009765625f;
56 break;
57
58 case 512U:
59 retValue = 0.001953125;
60 break;
61
62 case 256U:
63 retValue = 0.00390625f;
64 break;
65
66 case 128U:
67 retValue = 0.0078125;
68 break;
69
70 case 64U:
71 retValue = 0.015625f;
72 break;
73
74 case 32U:
75 retValue = 0.03125;
76 break;
77
78 case 16U:
79 retValue = 0.0625f;
80 break;
81
82
83 default:
84 break;
85 }
86 return(retValue);
87 }
88
89
/*
 * In-place bit-reversal reordering of an interleaved complex f32 buffer
 * using MVE gather/scatter.
 *
 * Each complex f32 pair (re, im) is 64 bits, so the buffer is treated as an
 * array of uint64_t and whole pairs are swapped in one move. The table
 * pBitRevTab holds pairs of offsets: element swaps are (tab[2k] <-> tab[2k+1]).
 * NOTE(review): the table entries appear to be pre-scaled byte offsets into
 * the uint64_t view (CMSIS MVE convention) — confirm against the table
 * generator before changing.
 */
static void arm_bitreversal_f32_inpl_mve(
        uint32_t *pSrc,
  const uint16_t bitRevLen,
  const uint16_t *pBitRevTab)

{
    uint64_t *src = (uint64_t *) pSrc;
    uint32_t blkCnt;                      /* loop counter: 4 table entries (2 swaps) per iteration */
    uint32x4_t bitRevTabOff;
    uint32x4_t one = vdupq_n_u32(1);

    /* bitRevLen counts table entries; each iteration consumes 4 of them */
    blkCnt = (bitRevLen / 2) / 2;
    while (blkCnt > 0U) {
        /* widen 4 x u16 table entries to u32 lanes */
        bitRevTabOff = vldrhq_u32(pBitRevTab);
        pBitRevTab += 4;

        /* multiply by 1 to split even lanes (bottom) and odd lanes (top)
           into two u64x2 offset vectors: the two halves of each swap pair */
        uint64x2_t bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
        uint64x2_t bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);

        /* gather both complex pairs of each swap... */
        uint64x2_t in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
        uint64x2_t in2 = vldrdq_gather_offset_u64(src, bitRevOff2);

        /* ...and scatter them back exchanged */
        vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
        vstrdq_scatter_offset_u64(src, bitRevOff2, in1);

        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
    }
}
121
122
/*
 * Forward radix-4 decimation-in-frequency CFFT core (Helium/MVE, f32).
 *
 * Runs log4(fftLen)-1 twiddled radix-4 stages, then a final twiddle-free
 * radix-4 stage performed with gather/scatter addressing so its outputs are
 * produced directly in the order expected by the bit-reversal table.
 * Output is NOT bit-reversed here; caller applies bit reversal if requested.
 */
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    uint32_t blkCnt;
    uint32_t n1, n2;                 /* current / quarter butterfly span (complex elements) */
    uint32_t stage = 0;              /* index into the per-stage twiddle offset tables */
    int32_t iter = 1;                /* number of sub-transforms at the current stage */
    /* byte offsets (relative to a base advanced by 64 bytes per step) for the
       last-stage gather of elements {0, 1, 8, 9} of each 16-float group.
       NOTE(review): sizeof(q31_t *) is used as the 4-byte unit — assumes
       32-bit pointers (true on Cortex-M); confirm if ever ported. */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all stages except the last (k counts remaining butterfly groups) */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        for (int i = 0; i < iter; i++)
        {
            /* per-stage twiddle tables, rearranged for vector access */
            float32_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const *pW1, *pW2, *pW3;
            /* four quarter-span legs of the i-th sub-transform */
            float32_t *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;
            f32x4_t vecW;


            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* software-pipelined pre-load for the next iteration;
                   reads one vector past the leg on the last pass (stays
                   inside the caller's buffer because the legs are adjacent) */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling: write-back advances the base by 64 bytes per group */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        /* twiddle-free radix-4: scatter relative to the already-advanced base */
        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
286
/*
 * Forward CFFT for lengths of the form 4^k * 2 (Helium/MVE, f32).
 *
 * Performs a single radix-2 decimation-in-frequency split (with twiddles)
 * over the full buffer, then runs the radix-4 core independently on each
 * half. Each half holds n2 = fftLen/2 complex elements, i.e. fftLen floats,
 * hence the pointer offsets below.
 *
 * @param[in]     S       CFFT instance (twiddles, per-stage tables)
 * @param[in,out] pSrc    interleaved complex data, processed in place
 * @param[in]     fftLen  transform length in complex samples
 *
 * Fix vs previous revision: removed a dead trailing assignment
 * (pIn0 = pSrc;) whose value was never read.
 */
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const *pCoef = S->pTwiddle;
    float32_t *pIn0, *pIn1;
    uint32_t n2;
    uint32_t blkCnt;
    f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;            /* first half  */
    pIn1 = pSrc + fftLen;   /* second half (fftLen floats = n2 complex) */
    pCoefVec = pCoef;

    /* radix-2 stage: 2 complex samples per vector iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        vecIn0 = *(f32x4_t *) pIn0;
        vecIn1 = *(f32x4_t *) pIn1;
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* (in0 - in1) * conj(W) for the forward transform */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    /* length-n2 radix-4 transform on each half */
    _arm_radix4_butterfly_f32_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
}
330
/*
 * Inverse radix-4 decimation-in-frequency CFFT core (Helium/MVE, f32).
 *
 * Mirror of _arm_radix4_butterfly_f32_mve with the twiddle multiplies
 * un-conjugated (MVE_CMPLX_MULT_FLT_AxB) and the +/-i legs exchanged,
 * i.e. the complex-conjugate transform. The final twiddle-free stage also
 * applies the 1/fftLen scaling (onebyfftLen) so the result matches the
 * textbook inverse DFT. Output is NOT bit-reversed here.
 */
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    f32x4_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;                 /* current / quarter butterfly span (complex elements) */
    uint32_t stage = 0;              /* index into the per-stage twiddle offset tables */
    int32_t iter = 1;                /* number of sub-transforms at the current stage */
    /* byte offsets for the last-stage gather; see forward version.
       NOTE(review): assumes 32-bit pointers (sizeof(q31_t *) == 4). */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all stages except the last */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        for (int i = 0; i < iter; i++)
        {
            /* per-stage twiddle tables, rearranged for vector access */
            float32_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const *pW1, *pW2, *pW3;
            /* four quarter-span legs of the i-th sub-transform */
            float32_t *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 * (ADD here, SUB in forward: conjugated transform)
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* software-pipelined pre-load for the next iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        /* twiddle-free radix-4, scaled by 1/fftLen */
        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
497
/*
 * Inverse CFFT for lengths of the form 4^k * 2 (Helium/MVE, f32).
 *
 * One twiddled radix-2 decimation-in-frequency split over the whole buffer,
 * followed by the inverse radix-4 core on each half. The 1/fftLen scale is
 * derived from the FULL length and handed down to the sub-transforms, which
 * apply it in their last stage.
 */
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
{
    const float32_t *pTw = S->pTwiddle;
    float32_t scale = arm_inverse_fft_length_f32(fftLen);
    uint32_t halfLen = fftLen >> 1;          /* complex samples per half */
    float32_t *pTop = pSrc;                  /* first half */
    float32_t *pBot = pSrc + fftLen;         /* second half (fftLen floats) */
    f32x4_t vecTop, vecBot, vecTwid;
    f32x4_t vecS, vecD, vecProd;

    /* radix-2 stage: process 2 complex samples per vector iteration */
    for (uint32_t cnt = halfLen / 2; cnt > 0U; cnt--)
    {
        vecTop  = vld1q(pTop);
        vecBot  = vld1q(pBot);
        vecTwid = vld1q(pTw);
        pTw += 4;

        vecS = vecTop + vecBot;
        vecD = vecTop - vecBot;

        /* inverse transform: multiply by the (non-conjugated) twiddle */
        vecProd = MVE_CMPLX_MULT_FLT_AxB(vecTwid, vecD);

        vst1q(pTop, vecS);
        pTop += 4;
        vst1q(pBot, vecProd);
        pBot += 4;
    }

    /* length-halfLen inverse radix-4 transform on each half */
    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, halfLen, scale);

    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, halfLen, scale);
}
540
541
542 /**
543 @addtogroup ComplexFFT
544 @{
545 */
546
547 /**
548 @brief Processing function for the floating-point complex FFT.
549 @param[in] S points to an instance of the floating-point CFFT structure
550 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
551 @param[in] ifftFlag flag that selects transform direction
552 - value = 0: forward transform
553 - value = 1: inverse transform
554 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
555 - value = 0: disables bit reversal of output
556 - value = 1: enables bit reversal of output
557 @return none
558 */
559
560
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
        float32_t * pSrc,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
    uint32_t fftLen = S->fftLen;

    /* Dispatch on length: pure powers of 4 go straight to the radix-4
       core, lengths of the form 4^k * 2 take the radix-2 split first.
       Unsupported lengths (e.g. an uninitialized instance) leave the
       buffer untouched. */
    if (ifftFlag == 1U) {

        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
            break;

        default:
            /* unsupported length: no-op */
            break;
        }
    } else {
        switch (fftLen) {
        case 16:
        case 64:
        case 256:
        case 1024:
        case 4096:
            _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
            break;

        case 32:
        case 128:
        case 512:
        case 2048:
            arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
            break;

        default:
            /* unsupported length: no-op */
            break;
        }
    }


    if (bitReverseFlag)
    {
        /* reorder from the natural butterfly output order to linear order */
        arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);

    }
}
614
615
616 #else
617 extern void arm_radix8_butterfly_f32(
618 float32_t * pSrc,
619 uint16_t fftLen,
620 const float32_t * pCoef,
621 uint16_t twidCoefModifier);
622
623 extern void arm_bitreversal_32(
624 uint32_t * pSrc,
625 const uint16_t bitRevLen,
626 const uint16_t * pBitRevTable);
627
628 /**
629 @ingroup groupTransforms
630 */
631
632 /**
633 @defgroup ComplexFFT Complex FFT Functions
634
635 @par
636 The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
637 Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
638 than the DFT, especially for long lengths.
639 The algorithms described in this section
640 operate on complex data. A separate set of functions is devoted to handling
641 of real sequences.
642 @par
643 There are separate algorithms for handling floating-point, Q15, and Q31 data
644 types. The algorithms available for each data type are described next.
645 @par
646 The FFT functions operate in-place. That is, the array holding the input data
647 will also be used to hold the corresponding result. The input data is complex
648 and contains <code>2*fftLen</code> interleaved values as shown below.
649 <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
650 The FFT result will be contained in the same array and the frequency domain
651 values will have the same interleaving.
652
653 @par Floating-point
654 The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
655 stages are performed along with a single radix-2 or radix-4 stage, as needed.
656 The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
657 a different twiddle factor table.
658 @par
659 The function uses the standard FFT definition and output values may grow by a
660 factor of <code>fftLen</code> when computing the forward transform. The
661 inverse transform includes a scale of <code>1/fftLen</code> as part of the
662 calculation and this matches the textbook definition of the inverse FFT.
663 @par
664 For the MVE version, the new arm_cfft_init_f32 initialization function is
665 <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
666 needed FFTs.</b> Other FFT versions can continue to be initialized as
667 explained below.
668 @par
                   For non-MVE versions, pre-initialized data structures containing twiddle factors
670 and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>. Include
671 this header in your function and then pass one of the constant structures as
672 an argument to arm_cfft_f32. For example:
673 @par
674 <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
675 @par
676 computes a 64-point inverse complex FFT including bit reversal.
677 The data structures are treated as constant data and not modified during the
678 calculation. The same data structure can be reused for multiple transforms
679 including mixing forward and inverse transforms.
680 @par
681 Earlier releases of the library provided separate radix-2 and radix-4
682 algorithms that operated on floating-point data. These functions are still
683 provided but are deprecated. The older functions are slower and less general
684 than the new functions.
685 @par
686 An example of initialization of the constants for the arm_cfft_f32 function follows:
687 @code
688 const static arm_cfft_instance_f32 *S;
689 ...
690 switch (length) {
691 case 16:
692 S = &arm_cfft_sR_f32_len16;
693 break;
694 case 32:
695 S = &arm_cfft_sR_f32_len32;
696 break;
697 case 64:
698 S = &arm_cfft_sR_f32_len64;
699 break;
700 case 128:
701 S = &arm_cfft_sR_f32_len128;
702 break;
703 case 256:
704 S = &arm_cfft_sR_f32_len256;
705 break;
706 case 512:
707 S = &arm_cfft_sR_f32_len512;
708 break;
709 case 1024:
710 S = &arm_cfft_sR_f32_len1024;
711 break;
712 case 2048:
713 S = &arm_cfft_sR_f32_len2048;
714 break;
715 case 4096:
716 S = &arm_cfft_sR_f32_len4096;
717 break;
718 }
719 @endcode
720 @par
721 The new arm_cfft_init_f32 can also be used.
722 @par Q15 and Q31
                   The fixed-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
724 stages are performed along with a single radix-2 stage, as needed.
725 The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
726 a different twiddle factor table.
727 @par
728 The function uses the standard FFT definition and output values may grow by a
729 factor of <code>fftLen</code> when computing the forward transform. The
730 inverse transform includes a scale of <code>1/fftLen</code> as part of the
731 calculation and this matches the textbook definition of the inverse FFT.
732 @par
733 Pre-initialized data structures containing twiddle factors and bit reversal
734 tables are provided and defined in <code>arm_const_structs.h</code>. Include
735 this header in your function and then pass one of the constant structures as
736 an argument to arm_cfft_q31. For example:
737 @par
738 <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
739 @par
740 computes a 64-point inverse complex FFT including bit reversal.
741 The data structures are treated as constant data and not modified during the
742 calculation. The same data structure can be reused for multiple transforms
743 including mixing forward and inverse transforms.
744 @par
745 Earlier releases of the library provided separate radix-2 and radix-4
                   algorithms that operated on Q15 and Q31 data. These functions are still
747 provided but are deprecated. The older functions are slower and less general
748 than the new functions.
749 @par
750 An example of initialization of the constants for the arm_cfft_q31 function follows:
751 @code
752 const static arm_cfft_instance_q31 *S;
753 ...
754 switch (length) {
755 case 16:
756 S = &arm_cfft_sR_q31_len16;
757 break;
758 case 32:
759 S = &arm_cfft_sR_q31_len32;
760 break;
761 case 64:
762 S = &arm_cfft_sR_q31_len64;
763 break;
764 case 128:
765 S = &arm_cfft_sR_q31_len128;
766 break;
767 case 256:
768 S = &arm_cfft_sR_q31_len256;
769 break;
770 case 512:
771 S = &arm_cfft_sR_q31_len512;
772 break;
773 case 1024:
774 S = &arm_cfft_sR_q31_len1024;
775 break;
776 case 2048:
777 S = &arm_cfft_sR_q31_len2048;
778 break;
779 case 4096:
780 S = &arm_cfft_sR_q31_len4096;
781 break;
782 }
783 @endcode
784
785 */
786
arm_cfft_radix8by2_f32(arm_cfft_instance_f32 * S,float32_t * p1)787 void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
788 {
789 uint32_t L = S->fftLen;
790 float32_t * pCol1, * pCol2, * pMid1, * pMid2;
791 float32_t * p2 = p1 + L;
792 const float32_t * tw = (float32_t *) S->pTwiddle;
793 float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
794 float32_t m0, m1, m2, m3;
795 uint32_t l;
796
797 pCol1 = p1;
798 pCol2 = p2;
799
800 /* Define new length */
801 L >>= 1;
802
803 /* Initialize mid pointers */
804 pMid1 = p1 + L;
805 pMid2 = p2 + L;
806
807 /* do two dot Fourier transform */
808 for (l = L >> 2; l > 0; l-- )
809 {
810 t1[0] = p1[0];
811 t1[1] = p1[1];
812 t1[2] = p1[2];
813 t1[3] = p1[3];
814
815 t2[0] = p2[0];
816 t2[1] = p2[1];
817 t2[2] = p2[2];
818 t2[3] = p2[3];
819
820 t3[0] = pMid1[0];
821 t3[1] = pMid1[1];
822 t3[2] = pMid1[2];
823 t3[3] = pMid1[3];
824
825 t4[0] = pMid2[0];
826 t4[1] = pMid2[1];
827 t4[2] = pMid2[2];
828 t4[3] = pMid2[3];
829
830 *p1++ = t1[0] + t2[0];
831 *p1++ = t1[1] + t2[1];
832 *p1++ = t1[2] + t2[2];
833 *p1++ = t1[3] + t2[3]; /* col 1 */
834
835 t2[0] = t1[0] - t2[0];
836 t2[1] = t1[1] - t2[1];
837 t2[2] = t1[2] - t2[2];
838 t2[3] = t1[3] - t2[3]; /* for col 2 */
839
840 *pMid1++ = t3[0] + t4[0];
841 *pMid1++ = t3[1] + t4[1];
842 *pMid1++ = t3[2] + t4[2];
843 *pMid1++ = t3[3] + t4[3]; /* col 1 */
844
845 t4[0] = t4[0] - t3[0];
846 t4[1] = t4[1] - t3[1];
847 t4[2] = t4[2] - t3[2];
848 t4[3] = t4[3] - t3[3]; /* for col 2 */
849
850 twR = *tw++;
851 twI = *tw++;
852
853 /* multiply by twiddle factors */
854 m0 = t2[0] * twR;
855 m1 = t2[1] * twI;
856 m2 = t2[1] * twR;
857 m3 = t2[0] * twI;
858
859 /* R = R * Tr - I * Ti */
860 *p2++ = m0 + m1;
861 /* I = I * Tr + R * Ti */
862 *p2++ = m2 - m3;
863
864 /* use vertical symmetry */
865 /* 0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
866 m0 = t4[0] * twI;
867 m1 = t4[1] * twR;
868 m2 = t4[1] * twI;
869 m3 = t4[0] * twR;
870
871 *pMid2++ = m0 - m1;
872 *pMid2++ = m2 + m3;
873
874 twR = *tw++;
875 twI = *tw++;
876
877 m0 = t2[2] * twR;
878 m1 = t2[3] * twI;
879 m2 = t2[3] * twR;
880 m3 = t2[2] * twI;
881
882 *p2++ = m0 + m1;
883 *p2++ = m2 - m3;
884
885 m0 = t4[2] * twI;
886 m1 = t4[3] * twR;
887 m2 = t4[3] * twI;
888 m3 = t4[2] * twR;
889
890 *pMid2++ = m0 - m1;
891 *pMid2++ = m2 + m3;
892 }
893
894 /* first col */
895 arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);
896
897 /* second col */
898 arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
899 }
900
arm_cfft_radix8by4_f32(arm_cfft_instance_f32 * S,float32_t * p1)901 void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
902 {
903 uint32_t L = S->fftLen >> 1;
904 float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
905 const float32_t *tw2, *tw3, *tw4;
906 float32_t * p2 = p1 + L;
907 float32_t * p3 = p2 + L;
908 float32_t * p4 = p3 + L;
909 float32_t t2[4], t3[4], t4[4], twR, twI;
910 float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
911 float32_t m0, m1, m2, m3;
912 uint32_t l, twMod2, twMod3, twMod4;
913
914 pCol1 = p1; /* points to real values by default */
915 pCol2 = p2;
916 pCol3 = p3;
917 pCol4 = p4;
918 pEnd1 = p2 - 1; /* points to imaginary values by default */
919 pEnd2 = p3 - 1;
920 pEnd3 = p4 - 1;
921 pEnd4 = pEnd3 + L;
922
923 tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;
924
925 L >>= 1;
926
927 /* do four dot Fourier transform */
928
929 twMod2 = 2;
930 twMod3 = 4;
931 twMod4 = 6;
932
933 /* TOP */
934 p1ap3_0 = p1[0] + p3[0];
935 p1sp3_0 = p1[0] - p3[0];
936 p1ap3_1 = p1[1] + p3[1];
937 p1sp3_1 = p1[1] - p3[1];
938
939 /* col 2 */
940 t2[0] = p1sp3_0 + p2[1] - p4[1];
941 t2[1] = p1sp3_1 - p2[0] + p4[0];
942 /* col 3 */
943 t3[0] = p1ap3_0 - p2[0] - p4[0];
944 t3[1] = p1ap3_1 - p2[1] - p4[1];
945 /* col 4 */
946 t4[0] = p1sp3_0 - p2[1] + p4[1];
947 t4[1] = p1sp3_1 + p2[0] - p4[0];
948 /* col 1 */
949 *p1++ = p1ap3_0 + p2[0] + p4[0];
950 *p1++ = p1ap3_1 + p2[1] + p4[1];
951
952 /* Twiddle factors are ones */
953 *p2++ = t2[0];
954 *p2++ = t2[1];
955 *p3++ = t3[0];
956 *p3++ = t3[1];
957 *p4++ = t4[0];
958 *p4++ = t4[1];
959
960 tw2 += twMod2;
961 tw3 += twMod3;
962 tw4 += twMod4;
963
964 for (l = (L - 2) >> 1; l > 0; l-- )
965 {
966 /* TOP */
967 p1ap3_0 = p1[0] + p3[0];
968 p1sp3_0 = p1[0] - p3[0];
969 p1ap3_1 = p1[1] + p3[1];
970 p1sp3_1 = p1[1] - p3[1];
971 /* col 2 */
972 t2[0] = p1sp3_0 + p2[1] - p4[1];
973 t2[1] = p1sp3_1 - p2[0] + p4[0];
974 /* col 3 */
975 t3[0] = p1ap3_0 - p2[0] - p4[0];
976 t3[1] = p1ap3_1 - p2[1] - p4[1];
977 /* col 4 */
978 t4[0] = p1sp3_0 - p2[1] + p4[1];
979 t4[1] = p1sp3_1 + p2[0] - p4[0];
980 /* col 1 - top */
981 *p1++ = p1ap3_0 + p2[0] + p4[0];
982 *p1++ = p1ap3_1 + p2[1] + p4[1];
983
984 /* BOTTOM */
985 p1ap3_1 = pEnd1[-1] + pEnd3[-1];
986 p1sp3_1 = pEnd1[-1] - pEnd3[-1];
987 p1ap3_0 = pEnd1[ 0] + pEnd3[0];
988 p1sp3_0 = pEnd1[ 0] - pEnd3[0];
989 /* col 2 */
990 t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
991 t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
992 /* col 3 */
993 t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
994 t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
995 /* col 4 */
996 t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
997 t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
998 /* col 1 - Bottom */
999 *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
1000 *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];
1001
1002 /* COL 2 */
1003 /* read twiddle factors */
1004 twR = *tw2++;
1005 twI = *tw2++;
1006 /* multiply by twiddle factors */
1007 /* let Z1 = a + i(b), Z2 = c + i(d) */
1008 /* => Z1 * Z2 = (a*c - b*d) + i(b*c + a*d) */
1009
1010 /* Top */
1011 m0 = t2[0] * twR;
1012 m1 = t2[1] * twI;
1013 m2 = t2[1] * twR;
1014 m3 = t2[0] * twI;
1015
1016 *p2++ = m0 + m1;
1017 *p2++ = m2 - m3;
1018 /* use vertical symmetry col 2 */
1019 /* 0.9997 - 0.0245i <==> 0.0245 - 0.9997i */
1020 /* Bottom */
1021 m0 = t2[3] * twI;
1022 m1 = t2[2] * twR;
1023 m2 = t2[2] * twI;
1024 m3 = t2[3] * twR;
1025
1026 *pEnd2-- = m0 - m1;
1027 *pEnd2-- = m2 + m3;
1028
1029 /* COL 3 */
1030 twR = tw3[0];
1031 twI = tw3[1];
1032 tw3 += twMod3;
1033 /* Top */
1034 m0 = t3[0] * twR;
1035 m1 = t3[1] * twI;
1036 m2 = t3[1] * twR;
1037 m3 = t3[0] * twI;
1038
1039 *p3++ = m0 + m1;
1040 *p3++ = m2 - m3;
1041 /* use vertical symmetry col 3 */
1042 /* 0.9988 - 0.0491i <==> -0.9988 - 0.0491i */
1043 /* Bottom */
1044 m0 = -t3[3] * twR;
1045 m1 = t3[2] * twI;
1046 m2 = t3[2] * twR;
1047 m3 = t3[3] * twI;
1048
1049 *pEnd3-- = m0 - m1;
1050 *pEnd3-- = m3 - m2;
1051
1052 /* COL 4 */
1053 twR = tw4[0];
1054 twI = tw4[1];
1055 tw4 += twMod4;
1056 /* Top */
1057 m0 = t4[0] * twR;
1058 m1 = t4[1] * twI;
1059 m2 = t4[1] * twR;
1060 m3 = t4[0] * twI;
1061
1062 *p4++ = m0 + m1;
1063 *p4++ = m2 - m3;
1064 /* use vertical symmetry col 4 */
1065 /* 0.9973 - 0.0736i <==> -0.0736 + 0.9973i */
1066 /* Bottom */
1067 m0 = t4[3] * twI;
1068 m1 = t4[2] * twR;
1069 m2 = t4[2] * twI;
1070 m3 = t4[3] * twR;
1071
1072 *pEnd4-- = m0 - m1;
1073 *pEnd4-- = m2 + m3;
1074 }
1075
1076 /* MIDDLE */
1077 /* Twiddle factors are */
1078 /* 1.0000 0.7071-0.7071i -1.0000i -0.7071-0.7071i */
1079 p1ap3_0 = p1[0] + p3[0];
1080 p1sp3_0 = p1[0] - p3[0];
1081 p1ap3_1 = p1[1] + p3[1];
1082 p1sp3_1 = p1[1] - p3[1];
1083
1084 /* col 2 */
1085 t2[0] = p1sp3_0 + p2[1] - p4[1];
1086 t2[1] = p1sp3_1 - p2[0] + p4[0];
1087 /* col 3 */
1088 t3[0] = p1ap3_0 - p2[0] - p4[0];
1089 t3[1] = p1ap3_1 - p2[1] - p4[1];
1090 /* col 4 */
1091 t4[0] = p1sp3_0 - p2[1] + p4[1];
1092 t4[1] = p1sp3_1 + p2[0] - p4[0];
1093 /* col 1 - Top */
1094 *p1++ = p1ap3_0 + p2[0] + p4[0];
1095 *p1++ = p1ap3_1 + p2[1] + p4[1];
1096
1097 /* COL 2 */
1098 twR = tw2[0];
1099 twI = tw2[1];
1100
1101 m0 = t2[0] * twR;
1102 m1 = t2[1] * twI;
1103 m2 = t2[1] * twR;
1104 m3 = t2[0] * twI;
1105
1106 *p2++ = m0 + m1;
1107 *p2++ = m2 - m3;
1108 /* COL 3 */
1109 twR = tw3[0];
1110 twI = tw3[1];
1111
1112 m0 = t3[0] * twR;
1113 m1 = t3[1] * twI;
1114 m2 = t3[1] * twR;
1115 m3 = t3[0] * twI;
1116
1117 *p3++ = m0 + m1;
1118 *p3++ = m2 - m3;
1119 /* COL 4 */
1120 twR = tw4[0];
1121 twI = tw4[1];
1122
1123 m0 = t4[0] * twR;
1124 m1 = t4[1] * twI;
1125 m2 = t4[1] * twR;
1126 m3 = t4[0] * twI;
1127
1128 *p4++ = m0 + m1;
1129 *p4++ = m2 - m3;
1130
1131 /* first col */
1132 arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);
1133
1134 /* second col */
1135 arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);
1136
1137 /* third col */
1138 arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);
1139
1140 /* fourth col */
1141 arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
1142 }
1143
1144 /**
1145 @addtogroup ComplexFFT
1146 @{
1147 */
1148
1149 /**
1150 @brief Processing function for the floating-point complex FFT.
1151 @param[in] S points to an instance of the floating-point CFFT structure
1152 @param[in,out] p1 points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
1153 @param[in] ifftFlag flag that selects transform direction
1154 - value = 0: forward transform
1155 - value = 1: inverse transform
1156 @param[in] bitReverseFlag flag that enables / disables bit reversal of output
1157 - value = 0: disables bit reversal of output
1158 - value = 1: enables bit reversal of output
1159 @return none
1160 */
1161
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
        float32_t * p1,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
  uint32_t  L = S->fftLen, l;
  float32_t invL, * pSrc;

  if (ifftFlag == 1U)
  {
    /* Conjugate input data: the inverse FFT is computed as
       conj(FFT(conj(x))) / L using the forward machinery */
    pSrc = p1 + 1;
    for (l = 0; l < L; l++)
    {
      *pSrc = -*pSrc;
      pSrc += 2;
    }
  }

  /* dispatch on length: radix-8 core plus a radix-2 or radix-4 split */
  switch (L)
  {
  case 16:
  case 128:
  case 1024:
    arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
    break;
  case 32:
  case 256:
  case 2048:
    arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
    break;
  case 64:
  case 512:
  case 4096:
    arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
    break;
  default:
    /* unsupported length (e.g. uninitialized instance): no-op */
    break;
  }

  if ( bitReverseFlag )
    arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);

  if (ifftFlag == 1U)
  {
    invL = 1.0f / (float32_t)L;

    /* Conjugate and scale output data to complete the inverse transform */
    pSrc = p1;
    for (l = 0; l < L; l++)
    {
      *pSrc++ *=   invL ;
      *pSrc    = -(*pSrc) * invL;
      pSrc++;
    }
  }
}
1218 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1219
1220 /**
1221 @} end of ComplexFFT group
1222 */
1223