• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_f32.c
4  * Description:  Combined Radix Decimation in Frequency CFFT Floating point processing function
5  *
6  * $Date:        18. March 2019
7  * $Revision:    V1.6.0
8  *
9  * Target Processor: Cortex-M cores
10  * -------------------------------------------------------------------- */
11 /*
12  * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
13  *
14  * SPDX-License-Identifier: Apache-2.0
15  *
16  * Licensed under the Apache License, Version 2.0 (the License); you may
17  * not use this file except in compliance with the License.
18  * You may obtain a copy of the License at
19  *
20  * www.apache.org/licenses/LICENSE-2.0
21  *
22  * Unless required by applicable law or agreed to in writing, software
23  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25  * See the License for the specific language governing permissions and
26  * limitations under the License.
27  */
28 
29 #include "arm_math.h"
30 #include "arm_common_tables.h"
31 
32 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
33 
34 #include "arm_helium_utils.h"
35 #include "arm_vec_fft.h"
36 #include "arm_mve_tables.h"
37 
38 
arm_inverse_fft_length_f32(uint16_t fftLen)39 static float32_t arm_inverse_fft_length_f32(uint16_t fftLen)
40 {
41   float32_t retValue=1.0;
42 
43   switch (fftLen)
44   {
45 
46   case 4096U:
47     retValue = 0.000244140625;
48     break;
49 
50   case 2048U:
51     retValue = 0.00048828125;
52     break;
53 
54   case 1024U:
55     retValue = 0.0009765625f;
56     break;
57 
58   case 512U:
59     retValue = 0.001953125;
60     break;
61 
62   case 256U:
63     retValue = 0.00390625f;
64     break;
65 
66   case 128U:
67     retValue = 0.0078125;
68     break;
69 
70   case 64U:
71     retValue = 0.015625f;
72     break;
73 
74   case 32U:
75     retValue = 0.03125;
76     break;
77 
78   case 16U:
79     retValue = 0.0625f;
80     break;
81 
82 
83   default:
84     break;
85   }
86   return(retValue);
87 }
88 
89 
/**
  @brief         In-place bit-reversal reordering of complex float32 data
                 using MVE gather/scatter.
  @param[in,out] pSrc        points to the complex buffer (re/im interleaved);
                             each complex element is handled as one 64-bit word
  @param[in]     bitRevLen   number of entries in the bit-reversal table
  @param[in]     pBitRevTab  table of byte offsets identifying element pairs
                             to swap
  @return        none

  NOTE(review): each iteration consumes 4 table entries (two swap pairs);
  assumes bitRevLen is a multiple of 4 — confirm against the table generator.
 */
static void arm_bitreversal_f32_inpl_mve(
        uint32_t *pSrc,
  const uint16_t  bitRevLen,
  const uint16_t *pBitRevTab)

{
    uint64_t       *src = (uint64_t *) pSrc;
    uint32_t        blkCnt;     /* loop counters */
    uint32x4_t      bitRevTabOff;
    uint32x4_t      one = vdupq_n_u32(1);

    /* 4 table entries = 2 swap pairs per iteration */
    blkCnt = (bitRevLen / 2) / 2;
    while (blkCnt > 0U) {
        /* load and widen 4 x u16 byte offsets to u32 lanes */
        bitRevTabOff = vldrhq_u32(pBitRevTab);
        pBitRevTab += 4;

        /* multiply-by-1 widening splits the vector: bottom (even) lanes hold
           the first element of each pair, top (odd) lanes hold its partner */
        uint64x2_t      bitRevOff1 = vmullbq_int_u32(bitRevTabOff, one);
        uint64x2_t      bitRevOff2 = vmulltq_int_u32(bitRevTabOff, one);

        /* gather both complex elements of each pair ... */
        uint64x2_t      in1 = vldrdq_gather_offset_u64(src, bitRevOff1);
        uint64x2_t      in2 = vldrdq_gather_offset_u64(src, bitRevOff2);

        /* ... and scatter them back with the pair swapped */
        vstrdq_scatter_offset_u64(src, bitRevOff1, in2);
        vstrdq_scatter_offset_u64(src, bitRevOff2, in1);

        /*
         * Decrement the blockSize loop counter
         */
        blkCnt--;
    }
}
121 
122 
/**
  @brief         Core radix-4 decimation-in-frequency forward CFFT (MVE).
  @param[in]     S       points to the CFFT instance holding the per-stage
                         rearranged twiddle tables
  @param[in,out] pSrc    points to the in-place complex float32 buffer
  @param[in]     fftLen  transform length handled here (power of 4)
  @return        none

  All stages except the last multiply by conjugated twiddles from the
  rearranged stride-1/2/3 tables; the final radix-4 stage is twiddle-free
  and performed with gather/scatter accesses.  Output is left in
  bit-reversed order (reordered later by the bit-reversal routine).
 */
static void _arm_radix4_butterfly_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;
    int32_t  iter = 1;
    /* byte offsets for the last-stage gather/scatter; negative values are
       relative to the post-incremented (+64) gather base address */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all stages except the last (k stops at > 1) */
    for (int k = fftLen / 4u; k > 1; k >>= 2)
    {
        /* iter butterfly groups per stage; quadruples each stage */
        for (int i = 0; i < iter; i++)
        {
            float32_t const     *p_rearranged_twiddle_tab_stride1 =
                                &S->rearranged_twiddle_stride1[
                                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const     *p_rearranged_twiddle_tab_stride2 =
                                &S->rearranged_twiddle_stride2[
                                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const     *p_rearranged_twiddle_tab_stride3 =
                                &S->rearranged_twiddle_stride3[
                                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const    *pW1, *pW2, *pW3;
            /* four legs of the radix-4 butterfly, n2 complex elements apart */
            float32_t           *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t           *inB = inA + n2 * CMPLX_DIM;
            float32_t           *inC = inB + n2 * CMPLX_DIM;
            float32_t           *inD = inC + n2 * CMPLX_DIM;
            f32x4_t            vecW;


            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;

                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW1);
                pW1 +=4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* software-pipelined pre-load for the next iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /* load scheduling */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    /* 8 complex elements (2 butterflies) per iteration */
    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
286 
/**
  @brief         Forward CFFT for lengths 2 * 4^n (32, 128, 512, 2048):
                 one radix-2 stage followed by two half-length radix-4
                 sub-transforms (MVE).
  @param[in]     S       points to the CFFT instance (twiddle tables)
  @param[in,out] pSrc    points to the in-place complex float32 buffer
  @param[in]     fftLen  transform length
  @return        none
 */
static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const  *pCoef = S->pTwiddle;
    float32_t        *pIn0, *pIn1;
    uint32_t          n2;
    uint32_t          blkCnt;
    f32x4_t         vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t         vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* fftLen floats = n2 complex elements ahead */
    pCoefVec = pCoef;

    /* radix-2 stage: 2 complex pairs per iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        vecIn0 = *(f32x4_t *) pIn0;
        vecIn1 = *(f32x4_t *) pIn1;
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* (in0 - in1) * conj(tw) */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    /* half-size radix-4 transform on each half of the buffer */
    _arm_radix4_butterfly_f32_mve(S, pSrc, n2);

    _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);

    /* removed dead store "pIn0 = pSrc;" — the local pointer was assigned
       immediately before returning and never read */
}
330 
/**
  @brief         Core radix-4 decimation-in-frequency inverse CFFT (MVE).
  @param[in]     S            points to the CFFT instance holding the
                              per-stage rearranged twiddle tables
  @param[in,out] pSrc         points to the in-place complex float32 buffer
  @param[in]     fftLen       transform length handled here (power of 4)
  @param[in]     onebyfftLen  1/fftLen normalization applied in the last stage
  @return        none

  Mirrors the forward butterfly but uses non-conjugated twiddle multiplies
  and swapped +i/-i rotations, and scales every last-stage output by
  onebyfftLen.  Output is left in bit-reversed order.
 */
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    f32x4_t vecW;
    uint32_t  blkCnt;
    uint32_t  n1, n2;
    uint32_t  stage = 0;
    int32_t  iter = 1;
    /* byte offsets for the last-stage gather/scatter; negative values are
       relative to the post-incremented (+64) gather base address */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all stages except the last (k stops at > 1) */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        /* iter butterfly groups per stage; quadruples each stage */
        for (int i = 0; i < iter; i++)
        {
            float32_t const *p_rearranged_twiddle_tab_stride1 =
                    &S->rearranged_twiddle_stride1[
                    S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride2 =
                    &S->rearranged_twiddle_stride2[
                    S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride3 =
                    &S->rearranged_twiddle_stride3[
                    S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const *pW1, *pW2, *pW3;
            /* four legs of the radix-4 butterfly, n2 complex elements apart */
            float32_t *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                /* inverse transform: non-conjugated twiddle multiply */
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* software-pipelined pre-load for the next iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;
        stage++;
    }

    /*
     * start of Last stage process
     */
    uint32x4_t vecScGathAddr = *(uint32x4_t *) strides;
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    /* 8 complex elements (2 butterflies) per iteration */
    blkCnt = (fftLen >> 3);
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;  /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC; /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        /* each output is scaled by 1/fftLen (inverse FFT normalization) */
        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
497 
/**
  @brief         Inverse CFFT for lengths 2 * 4^n (32, 128, 512, 2048):
                 one radix-2 stage followed by two half-length inverse
                 radix-4 sub-transforms (MVE).
  @param[in]     S       points to the CFFT instance (twiddle tables)
  @param[in,out] pSrc    points to the in-place complex float32 buffer
  @param[in]     fftLen  transform length
  @return        none

  The 1/fftLen normalization is computed here from the full length and
  applied inside the radix-4 sub-transforms' last stage.
 */
static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
{
    float32_t const *pCoefVec;
    float32_t const  *pCoef = S->pTwiddle;
    float32_t        *pIn0, *pIn1;
    uint32_t          n2;
    float32_t         onebyfftLen = arm_inverse_fft_length_f32(fftLen);
    uint32_t          blkCnt;
    f32x4_t         vecIn0, vecIn1, vecSum, vecDiff;
    f32x4_t         vecCmplxTmp, vecTw;


    n2 = fftLen >> 1;
    pIn0 = pSrc;
    pIn1 = pSrc + fftLen;   /* fftLen floats = n2 complex elements ahead */
    pCoefVec = pCoef;

    /* radix-2 stage: 2 complex pairs per iteration */
    blkCnt = n2 / 2;
    while (blkCnt > 0U)
    {
        vecIn0 = *(f32x4_t *) pIn0;
        vecIn1 = *(f32x4_t *) pIn1;
        vecTw = vld1q(pCoefVec);
        pCoefVec += 4;

        vecSum = vecIn0 + vecIn1;
        vecDiff = vecIn0 - vecIn1;

        /* inverse transform: (in0 - in1) * tw, non-conjugated */
        vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);

        vst1q(pIn0, vecSum);
        pIn0 += 4;
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        blkCnt--;
    }

    /* half-size inverse radix-4 transform on each half of the buffer */
    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen);

    _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen);
}
540 
541 
542 /**
543   @addtogroup ComplexFFT
544   @{
545  */
546 
547 /**
548   @brief         Processing function for the floating-point complex FFT.
549   @param[in]     S              points to an instance of the floating-point CFFT structure
550   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
551   @param[in]     ifftFlag       flag that selects transform direction
552                    - value = 0: forward transform
553                    - value = 1: inverse transform
554   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
555                    - value = 0: disables bit reversal of output
556                    - value = 1: enables bit reversal of output
557   @return        none
558  */
559 
560 
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
        float32_t * pSrc,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
        uint32_t fftLen = S->fftLen;

        if (ifftFlag == 1U) {

            switch (fftLen) {
            /* 4^n lengths: pure radix-4 decomposition */
            case 16:
            case 64:
            case 256:
            case 1024:
            case 4096:
                _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
                break;

            /* 2 * 4^n lengths: radix-2 stage + two radix-4 sub-transforms */
            case 32:
            case 128:
            case 512:
            case 2048:
                arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
                break;

            default:
                /* unsupported length: leave the data untouched */
                break;
            }
        } else {
            switch (fftLen) {
            /* 4^n lengths: pure radix-4 decomposition */
            case 16:
            case 64:
            case 256:
            case 1024:
            case 4096:
                _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
                break;

            /* 2 * 4^n lengths: radix-2 stage + two radix-4 sub-transforms */
            case 32:
            case 128:
            case 512:
            case 2048:
                arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
                break;

            default:
                /* unsupported length: leave the data untouched */
                break;
            }
        }

        /* the butterflies leave data bit-reversed; reorder on request */
        if (bitReverseFlag)
        {

            arm_bitreversal_f32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);

        }
}
614 
615 
616 #else
617 extern void arm_radix8_butterfly_f32(
618         float32_t * pSrc,
619         uint16_t fftLen,
620   const float32_t * pCoef,
621         uint16_t twidCoefModifier);
622 
623 extern void arm_bitreversal_32(
624         uint32_t * pSrc,
625   const uint16_t bitRevLen,
626   const uint16_t * pBitRevTable);
627 
628 /**
629   @ingroup groupTransforms
630  */
631 
632 /**
633   @defgroup ComplexFFT Complex FFT Functions
634 
635   @par
636                    The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
637                    Discrete Fourier Transform (DFT).  The FFT can be orders of magnitude faster
638                    than the DFT, especially for long lengths.
639                    The algorithms described in this section
640                    operate on complex data.  A separate set of functions is devoted to handling
641                    of real sequences.
642   @par
643                    There are separate algorithms for handling floating-point, Q15, and Q31 data
644                    types.  The algorithms available for each data type are described next.
645   @par
646                    The FFT functions operate in-place.  That is, the array holding the input data
647                    will also be used to hold the corresponding result.  The input data is complex
648                    and contains <code>2*fftLen</code> interleaved values as shown below.
649                    <pre>{real[0], imag[0], real[1], imag[1], ...} </pre>
650                    The FFT result will be contained in the same array and the frequency domain
651                    values will have the same interleaving.
652 
653   @par Floating-point
654                    The floating-point complex FFT uses a mixed-radix algorithm.  Multiple radix-8
655                    stages are performed along with a single radix-2 or radix-4 stage, as needed.
656                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
657                    a different twiddle factor table.
658   @par
659                    The function uses the standard FFT definition and output values may grow by a
660                    factor of <code>fftLen</code> when computing the forward transform.  The
661                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
662                    calculation and this matches the textbook definition of the inverse FFT.
663   @par
664                    For the MVE version, the new arm_cfft_init_f32 initialization function is
665                    <b>mandatory</b>. <b>Compilation flags are available to include only the required tables for the
666                    needed FFTs.</b> Other FFT versions can continue to be initialized as
667                    explained below.
668   @par
669                    For non-MVE versions, pre-initialized data structures containing twiddle factors
670                    and bit reversal tables are provided and defined in <code>arm_const_structs.h</code>.  Include
671                    this header in your function and then pass one of the constant structures as
672                    an argument to arm_cfft_f32.  For example:
673   @par
674                    <code>arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)</code>
675   @par
676                    computes a 64-point inverse complex FFT including bit reversal.
677                    The data structures are treated as constant data and not modified during the
678                    calculation.  The same data structure can be reused for multiple transforms
679                    including mixing forward and inverse transforms.
680   @par
681                    Earlier releases of the library provided separate radix-2 and radix-4
682                    algorithms that operated on floating-point data.  These functions are still
683                    provided but are deprecated.  The older functions are slower and less general
684                    than the new functions.
685   @par
686                    An example of initialization of the constants for the arm_cfft_f32 function follows:
687   @code
688                    const static arm_cfft_instance_f32 *S;
689                    ...
690                      switch (length) {
691                        case 16:
692                          S = &arm_cfft_sR_f32_len16;
693                          break;
694                        case 32:
695                          S = &arm_cfft_sR_f32_len32;
696                          break;
697                        case 64:
698                          S = &arm_cfft_sR_f32_len64;
699                          break;
700                        case 128:
701                          S = &arm_cfft_sR_f32_len128;
702                          break;
703                        case 256:
704                          S = &arm_cfft_sR_f32_len256;
705                          break;
706                        case 512:
707                          S = &arm_cfft_sR_f32_len512;
708                          break;
709                        case 1024:
710                          S = &arm_cfft_sR_f32_len1024;
711                          break;
712                        case 2048:
713                          S = &arm_cfft_sR_f32_len2048;
714                          break;
715                        case 4096:
716                          S = &arm_cfft_sR_f32_len4096;
717                          break;
718                      }
719   @endcode
720   @par
721                    The new arm_cfft_init_f32 can also be used.
722   @par Q15 and Q31
723                    The fixed-point complex FFT uses a mixed-radix algorithm.  Multiple radix-4
724                    stages are performed along with a single radix-2 stage, as needed.
725                    The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
726                    a different twiddle factor table.
727   @par
728                    The function uses the standard FFT definition and output values may grow by a
729                    factor of <code>fftLen</code> when computing the forward transform.  The
730                    inverse transform includes a scale of <code>1/fftLen</code> as part of the
731                    calculation and this matches the textbook definition of the inverse FFT.
732   @par
733                    Pre-initialized data structures containing twiddle factors and bit reversal
734                    tables are provided and defined in <code>arm_const_structs.h</code>.  Include
735                    this header in your function and then pass one of the constant structures as
736                    an argument to arm_cfft_q31. For example:
737   @par
738                    <code>arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)</code>
739   @par
740                    computes a 64-point inverse complex FFT including bit reversal.
741                    The data structures are treated as constant data and not modified during the
742                    calculation.  The same data structure can be reused for multiple transforms
743                    including mixing forward and inverse transforms.
744   @par
745                    Earlier releases of the library provided separate radix-2 and radix-4
746                    algorithms that operated on floating-point data.  These functions are still
747                    provided but are deprecated.  The older functions are slower and less general
748                    than the new functions.
749   @par
750                    An example of initialization of the constants for the arm_cfft_q31 function follows:
751   @code
752                    const static arm_cfft_instance_q31 *S;
753                    ...
754                      switch (length) {
755                        case 16:
756                          S = &arm_cfft_sR_q31_len16;
757                          break;
758                        case 32:
759                          S = &arm_cfft_sR_q31_len32;
760                          break;
761                        case 64:
762                          S = &arm_cfft_sR_q31_len64;
763                          break;
764                        case 128:
765                          S = &arm_cfft_sR_q31_len128;
766                          break;
767                        case 256:
768                          S = &arm_cfft_sR_q31_len256;
769                          break;
770                        case 512:
771                          S = &arm_cfft_sR_q31_len512;
772                          break;
773                        case 1024:
774                          S = &arm_cfft_sR_q31_len1024;
775                          break;
776                        case 2048:
777                          S = &arm_cfft_sR_q31_len2048;
778                          break;
779                        case 4096:
780                          S = &arm_cfft_sR_q31_len4096;
781                          break;
782                      }
783   @endcode
784 
785  */
786 
/**
  @brief         CFFT decomposition for lengths 2 * 8^n: one scalar radix-2
                 stage followed by two half-length radix-8 transforms.
  @param[in]     S    points to the CFFT instance (twiddle table, length)
  @param[in,out] p1   points to the in-place complex float32 buffer
  @return        none

  The radix-2 stage processes two complex elements per loop iteration and
  exploits the vertical symmetry of the twiddle table: the mirror-half
  outputs (pMid2) reuse the same twR/twI values with real/imag roles
  exchanged (e.g. 0.9988 - 0.0491i <==> -0.0491 - 0.9988i).
 */
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
  uint32_t    L  = S->fftLen;
  float32_t * pCol1, * pCol2, * pMid1, * pMid2;
  float32_t * p2 = p1 + L;
  const float32_t * tw = (float32_t *) S->pTwiddle;
  float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
  float32_t m0, m1, m2, m3;
  uint32_t l;

  pCol1 = p1;
  pCol2 = p2;

  /* Define new length */
  L >>= 1;

  /* Initialize mid pointers */
  pMid1 = p1 + L;
  pMid2 = p2 + L;

  /* do two dot Fourier transform */
  /* each iteration handles 2 complex elements from the front half and
     2 from the mirror half (4 floats each) */
  for (l = L >> 2; l > 0; l-- )
  {
    t1[0] = p1[0];
    t1[1] = p1[1];
    t1[2] = p1[2];
    t1[3] = p1[3];

    t2[0] = p2[0];
    t2[1] = p2[1];
    t2[2] = p2[2];
    t2[3] = p2[3];

    t3[0] = pMid1[0];
    t3[1] = pMid1[1];
    t3[2] = pMid1[2];
    t3[3] = pMid1[3];

    t4[0] = pMid2[0];
    t4[1] = pMid2[1];
    t4[2] = pMid2[2];
    t4[3] = pMid2[3];

    /* butterfly sums go to column 1 */
    *p1++ = t1[0] + t2[0];
    *p1++ = t1[1] + t2[1];
    *p1++ = t1[2] + t2[2];
    *p1++ = t1[3] + t2[3];    /* col 1 */

    /* butterfly differences, to be twiddled for column 2 */
    t2[0] = t1[0] - t2[0];
    t2[1] = t1[1] - t2[1];
    t2[2] = t1[2] - t2[2];
    t2[3] = t1[3] - t2[3];    /* for col 2 */

    *pMid1++ = t3[0] + t4[0];
    *pMid1++ = t3[1] + t4[1];
    *pMid1++ = t3[2] + t4[2];
    *pMid1++ = t3[3] + t4[3]; /* col 1 */

    /* note the reversed sign (t4 - t3) in the mirror half */
    t4[0] = t4[0] - t3[0];
    t4[1] = t4[1] - t3[1];
    t4[2] = t4[2] - t3[2];
    t4[3] = t4[3] - t3[3];    /* for col 2 */

    twR = *tw++;
    twI = *tw++;

    /* multiply by twiddle factors */
    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    /* R  =  R  *  Tr - I * Ti */
    *p2++ = m0 + m1;
    /* I  =  I  *  Tr + R * Ti */
    *p2++ = m2 - m3;

    /* use vertical symmetry */
    /*  0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
    m0 = t4[0] * twI;
    m1 = t4[1] * twR;
    m2 = t4[1] * twI;
    m3 = t4[0] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;

    /* second complex element of this iteration, next twiddle pair */
    twR = *tw++;
    twI = *tw++;

    m0 = t2[2] * twR;
    m1 = t2[3] * twI;
    m2 = t2[3] * twR;
    m3 = t2[2] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;

    m0 = t4[2] * twI;
    m1 = t4[3] * twR;
    m2 = t4[3] * twI;
    m3 = t4[2] * twR;

    *pMid2++ = m0 - m1;
    *pMid2++ = m2 + m3;
  }

  /* first col */
  arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

  /* second col */
  arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
}
900 
/**
  @brief         Radix-4 decimation-in-frequency split stage: combines the four
                 quarters of a length-N complex buffer, applies twiddle factors
                 (strides 2, 4 and 6 for columns 2, 3 and 4), then finishes each
                 quarter with a length-N/4 radix-8 transform (twiddle stride 4).
  @param[in]     S   points to the CFFT instance (provides fftLen and pTwiddle)
  @param[in,out] p1  points to the in-place complex buffer of 2*fftLen floats
                     (interleaved real/imaginary)
  @return        none

  The main loop walks each quarter from both ends at once: p1..p4 advance from
  the front while pEnd1..pEnd4 retreat from the back, exploiting the twiddle
  table's vertical symmetry so the mirrored samples reuse the same twR/twI
  pair. The first sample (unit twiddles) and the exact middle sample are
  peeled out of the loop and handled separately.
 */
void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
    uint32_t    L  = S->fftLen >> 1;             /* floats per quarter (fftLen/4 complex) */
    float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
    const float32_t *tw2, *tw3, *tw4;
    float32_t * p2 = p1 + L;
    float32_t * p3 = p2 + L;
    float32_t * p4 = p3 + L;
    float32_t t2[4], t3[4], t4[4], twR, twI;
    float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
    float32_t m0, m1, m2, m3;
    uint32_t l, twMod2, twMod3, twMod4;

    pCol1 = p1;         /* points to real values by default */
    pCol2 = p2;
    pCol3 = p3;
    pCol4 = p4;
    pEnd1 = p2 - 1;     /* points to imaginary values by default */
    pEnd2 = p3 - 1;
    pEnd3 = p4 - 1;
    pEnd4 = pEnd3 + L;

    /* All three column pointers start at the table head; they diverge via
       their per-iteration strides twMod2/3/4 below. */
    tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

    L >>= 1;

    /* do four dot Fourier transform */

    /* Twiddle strides in floats: columns 2/3/4 use W^k, W^2k, W^3k. */
    twMod2 = 2;
    twMod3 = 4;
    twMod4 = 6;

    /* First sample peeled out: its twiddles are all 1, so the butterfly
       outputs are stored without any multiply. */
    /* TOP */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* Twiddle factors are ones */
    *p2++ = t2[0];
    *p2++ = t2[1];
    *p3++ = t3[0];
    *p3++ = t3[1];
    *p4++ = t4[0];
    *p4++ = t4[1];

    tw2 += twMod2;
    tw3 += twMod3;
    tw4 += twMod4;

    /* Main loop: one forward (top) and one mirrored backward (bottom)
       complex sample per quarter per iteration; the first and middle
       samples are handled outside, hence (L - 2) >> 1 iterations. */
    for (l = (L - 2) >> 1; l > 0; l-- )
    {
      /* TOP */
      p1ap3_0 = p1[0] + p3[0];
      p1sp3_0 = p1[0] - p3[0];
      p1ap3_1 = p1[1] + p3[1];
      p1sp3_1 = p1[1] - p3[1];
      /* col 2 */
      t2[0] = p1sp3_0 + p2[1] - p4[1];
      t2[1] = p1sp3_1 - p2[0] + p4[0];
      /* col 3 */
      t3[0] = p1ap3_0 - p2[0] - p4[0];
      t3[1] = p1ap3_1 - p2[1] - p4[1];
      /* col 4 */
      t4[0] = p1sp3_0 - p2[1] + p4[1];
      t4[1] = p1sp3_1 + p2[0] - p4[0];
      /* col 1 - top */
      *p1++ = p1ap3_0 + p2[0] + p4[0];
      *p1++ = p1ap3_1 + p2[1] + p4[1];

      /* BOTTOM: same butterfly on the mirrored sample; pEnd* point at the
         imaginary part, [-1] is the matching real part. */
      p1ap3_1 = pEnd1[-1] + pEnd3[-1];
      p1sp3_1 = pEnd1[-1] - pEnd3[-1];
      p1ap3_0 = pEnd1[ 0] + pEnd3[0];
      p1sp3_0 = pEnd1[ 0] - pEnd3[0];
      /* col 2 */
      t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
      t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
      /* col 3 */
      t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
      t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
      /* col 4 */
      t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
      t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
      /* col 1 - Bottom */
      *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
      *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

      /* COL 2 */
      /* read twiddle factors */
      twR = *tw2++;
      twI = *tw2++;
      /* multiply by twiddle factors */
      /*  let    Z1 = a + i(b),   Z2 = c + i(d) */
      /*   =>  Z1 * Z2  =  (a*c - b*d) + i(b*c + a*d) */

      /* Top */
      m0 = t2[0] * twR;
      m1 = t2[1] * twI;
      m2 = t2[1] * twR;
      m3 = t2[0] * twI;

      *p2++ = m0 + m1;
      *p2++ = m2 - m3;
      /* use vertical symmetry col 2 */
      /* 0.9997 - 0.0245i  <==>  0.0245 - 0.9997i */
      /* Bottom */
      m0 = t2[3] * twI;
      m1 = t2[2] * twR;
      m2 = t2[2] * twI;
      m3 = t2[3] * twR;

      *pEnd2-- = m0 - m1;
      *pEnd2-- = m2 + m3;

      /* COL 3 */
      twR = tw3[0];
      twI = tw3[1];
      tw3 += twMod3;
      /* Top */
      m0 = t3[0] * twR;
      m1 = t3[1] * twI;
      m2 = t3[1] * twR;
      m3 = t3[0] * twI;

      *p3++ = m0 + m1;
      *p3++ = m2 - m3;
      /* use vertical symmetry col 3 */
      /* 0.9988 - 0.0491i  <==>  -0.9988 - 0.0491i */
      /* Bottom: mirrored twiddle has the real part negated. */
      m0 = -t3[3] * twR;
      m1 =  t3[2] * twI;
      m2 =  t3[2] * twR;
      m3 =  t3[3] * twI;

      *pEnd3-- = m0 - m1;
      *pEnd3-- = m3 - m2;

      /* COL 4 */
      twR = tw4[0];
      twI = tw4[1];
      tw4 += twMod4;
      /* Top */
      m0 = t4[0] * twR;
      m1 = t4[1] * twI;
      m2 = t4[1] * twR;
      m3 = t4[0] * twI;

      *p4++ = m0 + m1;
      *p4++ = m2 - m3;
      /* use vertical symmetry col 4 */
      /* 0.9973 - 0.0736i  <==>  -0.0736 + 0.9973i */
      /* Bottom */
      m0 = t4[3] * twI;
      m1 = t4[2] * twR;
      m2 = t4[2] * twI;
      m3 = t4[3] * twR;

      *pEnd4-- = m0 - m1;
      *pEnd4-- = m2 + m3;
    }

    /* MIDDLE */
    /* Twiddle factors are */
    /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 - Top */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* COL 2 */
    twR = tw2[0];
    twI = tw2[1];

    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;
    /* COL 3 */
    twR = tw3[0];
    twI = tw3[1];

    m0 = t3[0] * twR;
    m1 = t3[1] * twI;
    m2 = t3[1] * twR;
    m3 = t3[0] * twI;

    *p3++ = m0 + m1;
    *p3++ = m2 - m3;
    /* COL 4 */
    twR = tw4[0];
    twI = tw4[1];

    m0 = t4[0] * twR;
    m1 = t4[1] * twI;
    m2 = t4[1] * twR;
    m3 = t4[0] * twI;

    *p4++ = m0 + m1;
    *p4++ = m2 - m3;

    /* Finish each quarter with a length-L radix-8 pass (twiddle stride 4). */
    /* first col */
    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

    /* second col */
    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

    /* third col */
    arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

    /* fourth col */
    arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
}
1143 
1144 /**
1145   @addtogroup ComplexFFT
1146   @{
1147  */
1148 
1149 /**
1150   @brief         Processing function for the floating-point complex FFT.
1151   @param[in]     S              points to an instance of the floating-point CFFT structure
1152   @param[in,out] p1             points to the complex data buffer of size <code>2*fftLen</code>. Processing occurs in-place
1153   @param[in]     ifftFlag       flag that selects transform direction
1154                    - value = 0: forward transform
1155                    - value = 1: inverse transform
1156   @param[in]     bitReverseFlag flag that enables / disables bit reversal of output
1157                    - value = 0: disables bit reversal of output
1158                    - value = 1: enables bit reversal of output
1159   @return        none
1160  */
1161 
void arm_cfft_f32(
  const arm_cfft_instance_f32 * S,
        float32_t * p1,
        uint8_t ifftFlag,
        uint8_t bitReverseFlag)
{
  uint32_t  fftLen = S->fftLen;
  uint32_t  idx;
  float32_t oneByLen;
  float32_t *pBuf;

  /* Inverse transform is computed as conj(FFT(conj(x))) / N:
     start by negating every imaginary sample. */
  if (ifftFlag == 1U)
  {
    pBuf = p1 + 1;
    for (idx = 0U; idx < fftLen; idx++)
    {
      *pBuf = -(*pBuf);
      pBuf += 2;
    }
  }

  /* Dispatch on length: 8*4^k lengths run pure radix-8; the others need
     one extra radix-2 or radix-4 split stage first. */
  switch (fftLen)
  {
  case 16U:
  case 128U:
  case 1024U:
    arm_cfft_radix8by2_f32((arm_cfft_instance_f32 *) S, p1);
    break;

  case 32U:
  case 256U:
  case 2048U:
    arm_cfft_radix8by4_f32((arm_cfft_instance_f32 *) S, p1);
    break;

  case 64U:
  case 512U:
  case 4096U:
    arm_radix8_butterfly_f32(p1, fftLen, (float32_t *) S->pTwiddle, 1);
    break;
  }

  /* Optionally reorder the output into natural order. */
  if (bitReverseFlag)
  {
    arm_bitreversal_32((uint32_t *) p1, S->bitRevLength, S->pBitRevTable);
  }

  /* Inverse transform: conjugate back and scale by 1/N. */
  if (ifftFlag == 1U)
  {
    oneByLen = 1.0f / (float32_t) fftLen;

    pBuf = p1;
    for (idx = 0U; idx < fftLen; idx++)
    {
      *pBuf = (*pBuf) * oneByLen;        /* real part: scale only        */
      pBuf++;
      *pBuf = -(*pBuf) * oneByLen;       /* imaginary part: scale+negate */
      pBuf++;
    }
  }
}
1218 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
1219 
1220 /**
1221   @} end of ComplexFFT group
1222  */
1223