• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* ----------------------------------------------------------------------
2  * Project:      CMSIS DSP Library
3  * Title:        arm_cfft_radix4_q15.c
4  * Description:  This file has function definition of Radix-4 FFT & IFFT function and
5  *               In-place bit reversal using bit reversal table
6  *
7  * $Date:        23 April 2021
8  * $Revision:    V1.9.0
9  *
10  * Target Processor: Cortex-M and Cortex-A cores
11  * -------------------------------------------------------------------- */
12 /*
13  * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
14  *
15  * SPDX-License-Identifier: Apache-2.0
16  *
17  * Licensed under the Apache License, Version 2.0 (the License); you may
18  * not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  *
21  * www.apache.org/licenses/LICENSE-2.0
22  *
23  * Unless required by applicable law or agreed to in writing, software
24  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
25  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
26  * See the License for the specific language governing permissions and
27  * limitations under the License.
28  */
29 
30 #include "dsp/transform_functions.h"
31 
32 
33 void arm_radix4_butterfly_q15(
34         q15_t * pSrc16,
35         uint32_t fftLen,
36   const q15_t * pCoef16,
37         uint32_t twidCoefModifier);
38 
39 void arm_radix4_butterfly_inverse_q15(
40         q15_t * pSrc16,
41         uint32_t fftLen,
42   const q15_t * pCoef16,
43         uint32_t twidCoefModifier);
44 
45 void arm_bitreversal_q15(
46         q15_t * pSrc,
47         uint32_t fftLen,
48         uint16_t bitRevFactor,
49   const uint16_t * pBitRevTab);
50 
51 /**
52   @addtogroup ComplexFFTDeprecated
53   @{
54  */
55 
56 
57 /**
58   @brief               Processing function for the Q15 CFFT/CIFFT.
59   @deprecated          Do not use this function.  It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
60   @param[in]     S     points to an instance of the Q15 CFFT/CIFFT structure.
61   @param[in,out] pSrc  points to the complex data buffer. Processing occurs in-place.
62   @return        none
63 
64   @par Input and output formats:
65                  Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
66                  Hence the output format is different for different FFT sizes.
67                  The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
68   @par
69 
70 | CFFT Size | Input format  | Output format | Number of bits to upscale |
71 | --------: | ------------: | ------------: | ------------------------: |
72 | 16        | 1.15          | 5.11          | 4                         |
73 | 64        | 1.15          | 7.9           | 6                         |
74 | 256       | 1.15          | 9.7           | 8                         |
75 | 1024      | 1.15          | 11.5          | 10                        |
76 
77 | CIFFT Size | Input format  | Output format | Number of bits to upscale |
78 | ---------: | ------------: | ------------: | ------------------------: |
79 | 16         | 1.15          | 5.11          | 0                         |
80 | 64         | 1.15          | 7.9           | 0                         |
81 | 256        | 1.15          | 9.7           | 0                         |
82 | 1024       | 1.15          | 11.5          | 0                         |
83 
84  */
85 
void arm_cfft_radix4_q15(
  const arm_cfft_radix4_instance_q15 * S,
        q15_t * pSrc)
{
  /* Step 1: run the butterfly core in place on pSrc.
     Only ifftFlag == 1 selects the inverse core; any other value runs the forward core. */
  if (S->ifftFlag != 1U)
  {
    /* Complex FFT radix-4 */
    arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  }
  else
  {
    /* Complex IFFT radix-4 */
    arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  }

  /* Step 2: the bit reversal pass runs only when bitReverseFlag == 1. */
  if (S->bitReverseFlag != 1U)
  {
    return;
  }

  arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
}
108 
109 /**
110   @} end of ComplexFFTDeprecated group
111  */
112 
113 /*
114  * Radix-4 FFT algorithm used is :
115  *
116  * Input real and imaginary data:
117  * x(n) = xa + j * ya
118  * x(n+N/4 ) = xb + j * yb
119  * x(n+N/2 ) = xc + j * yc
120  * x(n+3N/4) = xd + j * yd
121  *
122  *
123  * Output real and imaginary data:
124  * x(4r) = xa'+ j * ya'
125  * x(4r+1) = xb'+ j * yb'
126  * x(4r+2) = xc'+ j * yc'
127  * x(4r+3) = xd'+ j * yd'
128  *
129  *
130  * Twiddle factors for radix-4 FFT:
131  * Wn = co1 + j * (- si1)
132  * W2n = co2 + j * (- si2)
133  * W3n = co3 + j * (- si3)
134 
135  * The real and imaginary output values for the radix-4 butterfly are
136  * xa' = xa + xb + xc + xd
137  * ya' = ya + yb + yc + yd
138  * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
139  * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
140  * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
141  * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
142  * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
143  * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
144  *
145  */
146 
147 /**
148   @brief         Core function for the Q15 CFFT butterfly process.
149   @param[in,out] pSrc16          points to the in-place buffer of Q15 data type
150   @param[in]     fftLen           length of the FFT
151   @param[in]     pCoef16         points to twiddle coefficient buffer
152   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
153   @return        none
154  */
155 
void arm_radix4_butterfly_q15(
        q15_t * pSrc16,
        uint32_t fftLen,
  const q15_t * pCoef16,
        uint32_t twidCoefModifier)
{
  /*
   * In-place radix-4 butterfly on interleaved complex Q15 data.
   *
   * Two implementations are selected at compile time by ARM_MATH_DSP:
   *  - the DSP branch works on packed pairs of q15 values held in a q31_t
   *    and uses the SIMD intrinsics (__SHADD16, __QADD16, __SMUAD, ...);
   *  - the other branch works on individual q15 values with __SSAT.
   *
   * Both branches run a first stage (inputs pre-scaled by 4, twiddle
   * multiplications), zero or more middle stages, and a last stage that
   * needs no twiddle multiplications.
   *
   * fftLen values below 4 are rejected up front: the first-stage counter
   * is fftLen / 4 and the do/while loop decrements before testing, so a
   * zero count would wrap around and run far outside the buffer.
   */

#if defined (ARM_MATH_DSP)

        q31_t R, S, T, U;
        q31_t C1, C2, C3, out1, out2;
        uint32_t n1, n2, ic, i0, j, k;

        q15_t *ptr1;
        q15_t *pSi0;
        q15_t *pSi1;
        q15_t *pSi2;
        q15_t *pSi3;

        q31_t xaya, xbyb, xcyc, xdyd;

  /* Nothing to do (and unsafe to continue) when there is not even one radix-4 butterfly */
  if (fftLen < 4U)
  {
    return;
  }

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  j = n2;

  pSi0 = pSrc16;
  pSi1 = pSi0 + 2 * n2;
  pSi2 = pSi1 + 2 * n2;
  pSi3 = pSi2 + 2 * n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /* Reading i0, i0+fftLen/2 inputs */
    /* Read ya (real), xa(imag) input */
    T = read_q15x2 (pSi0);
    T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
    T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
/*
    in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
*/

    /* Read yc (real), xc(imag) input */
    S = read_q15x2 (pSi2);
    S = __SHADD16(S, 0);
    S = __SHADD16(S, 0);

    /* R = packed((ya + yc), (xa + xc) ) */
    R = __QADD16(T, S);

    /* S = packed((ya - yc), (xa - xc) ) */
    S = __QSUB16(T, S);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* Read yb (real), xb(imag) input */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read yd (real), xd(imag) input */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed((yb + yd), (xb + xd) ) */
    T = __QADD16(T, U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&pSi0, __SHADD16(R, T));

    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
    R = __QSUB16(R, T);

    /* co2 & si2 are read from SIMD Coefficient pointer */
    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));

#ifndef ARM_MATH_BIG_ENDIAN
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out1 = __SMUAD(C2, R) >> 16U;
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out2 = __SMUSDX(C2, R);
#else
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out1 = __SMUSDX(R, C2) >> 16U;
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /*  Reading i0+fftLen/4 again, before the slot is overwritten below */
    /* T = packed(yb, xb) */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));

    /*  Butterfly calculations */
    /* U = packed(yd, xd) */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed(yb-yd, xb-xd) */
    T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QASX(S, T);
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
    S = __QSAX(S, T);
#else
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QSAX(S, T);
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
    S = __QASX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* co1 & si1 are read from SIMD Coefficient pointer */
    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
    /*  Butterfly process for the i0+fftLen/2 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out1 = __SMUAD(C1, S) >> 16U;
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out2 = __SMUSDX(C1, S);
#else
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out1 = __SMUSDX(S, C1) >> 16U;
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xb', yb') in little endian format */
    write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));

    /* co3 & si3 are read from SIMD Coefficient pointer */
    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
    /*  Butterfly process for the i0+3fftLen/4 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out1 = __SMUAD(C3, R) >> 16U;
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out2 = __SMUSDX(C3, R);
#else
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out1 = __SMUSDX(R, C3) >> 16U;
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xd', yd') in little endian format */
    write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

  } while (--j);
  /* data is in 4.12(q12) format */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      pSi0 = pSrc16 + 2 * j;
      pSi1 = pSi0 + 2 * n2;
      pSi2 = pSi1 + 2 * n2;
      pSi3 = pSi2 + 2 * n2;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  Reading i0, i0+fftLen/2 inputs */
        /* Read ya (real), xa(imag) input */
        T = read_q15x2 (pSi0);

        /* Read yc (real), xc(imag) input */
        S = read_q15x2 (pSi2);

        /* R = packed( (ya + yc), (xa + xc)) */
        R = __QADD16(T, S);

        /* S = packed((ya - yc), (xa - xc)) */
        S = __QSUB16(T, S);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* Read yb (real), xb(imag) input */
        T = read_q15x2 (pSi1);

        /* Read yd (real), xd(imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed( (yb + yd), (xb + xd)) */
        T = __QADD16(T, U);

        /*  writing the butterfly processed i0 sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = __SHADD16(R, T);
        out1 = __SHADD16(out1, 0);
        write_q15x2 (pSi0, out1);
        pSi0 += 2 * n1;

        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
        R = __SHSUB16(R, T);

#ifndef ARM_MATH_BIG_ENDIAN
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out1 = __SMUAD(C2, R) >> 16U;

        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out2 = __SMUSDX(C2, R);
#else
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out1 = __SMUSDX(R, C2) >> 16U;

        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /*  Reading i0+fftLen/4 again, before the slot is overwritten below */
        /* Read yb (real), xb(imag) input */
        T = read_q15x2 (pSi1);

        /*  writing the butterfly processed i0 + fftLen/4 sample */
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
        pSi1 += 2 * n1;

        /*  Butterfly calculations */

        /* Read yd (real), xd(imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed(yb-yd, xb-xd) */
        T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHASX(S, T);

        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHSAX(S, T);


        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = __SMUAD(C1, S) >> 16U;
        out2 = __SMUSDX(C1, S);
#else
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHSAX(S, T);

        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHASX(S, T);


        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = __SMUSDX(S, C1) >> 16U;
        out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
        pSi2 += 2 * n1;

        /*  Butterfly process for the i0+3fftLen/4 sample */

#ifndef ARM_MATH_BIG_ENDIAN
        out1 = __SMUAD(C3, R) >> 16U;
        out2 = __SMUSDX(C3, R);
#else
        out1 = __SMUSDX(R, C3) >> 16U;
        out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
        write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
        pSi3 += 2 * n1;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  j = fftLen >> 2;

  ptr1 = &pSrc16[0];

  /* start of last stage process */

  /*  Butterfly implementation */
  do
  {
    /* Read xa (real), ya(imag) input */
    xaya = read_q15x2_ia (&ptr1);

    /* Read xb (real), yb(imag) input */
    xbyb = read_q15x2_ia (&ptr1);

    /* Read xc (real), yc(imag) input */
    xcyc = read_q15x2_ia (&ptr1);

    /* Read xd (real), yd(imag) input */
    xdyd = read_q15x2_ia (&ptr1);

    /* R = packed((ya + yc), (xa + xc)) */
    R = __QADD16(xaya, xcyc);

    /* T = packed((yb + yd), (xb + xd)) */
    T = __QADD16(xbyb, xdyd);

    /* rewind to the first of the four samples just read, for writing */
    ptr1 = ptr1 - 8U;


    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&ptr1, __SHADD16(R, T));

    /* R and T are unchanged since they were computed above, so they are reused here */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    write_q15x2_ia (&ptr1, __SHSUB16(R, T));

    /* S = packed((ya - yc), (xa - xc)) */
    S = __QSUB16(xaya, xcyc);

    /* U = packed( (yb - yd), (xb - xd))  */
    U = __QSUB16(xbyb, xdyd);

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));
#else
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

  } while (--j);

  /* end of last stage process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */


#else /* #if defined (ARM_MATH_DSP) */

        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;

  /* Nothing to do (and unsafe to continue) when there is not even one radix-4 butterfly */
  if (fftLen < 4U)
  {
    return;
  }

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  i0 = 0U;
  j = n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */

    /* input is down scale by 4 to avoid overflow */
    /* Read ya (real), xa(imag) input */
    T0 = pSrc16[i0 * 2U] >> 2U;
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;

    /* input is down scale by 4 to avoid overflow */
    /* Read yc (real), xc(imag) input */
    S0 = pSrc16[i2 * 2U] >> 2U;
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;

    /* R0 = (ya + yc) */
    R0 = __SSAT(T0 + S0, 16U);
    /* R1 = (xa + xc) */
    R1 = __SSAT(T1 + S1, 16U);

    /* S0 = (ya - yc) */
    S0 = __SSAT(T0 - S0, 16);
    /* S1 = (xa - xc) */
    S1 = __SSAT(T1 - S1, 16);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* input is down scale by 4 to avoid overflow */
    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;

    /* input is down scale by 4 to avoid overflow */
    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1] >> 2U;

    /* T0 = (yb + yd) */
    T0 = __SSAT(T0 + U0, 16U);
    /* T1 = (xb + xd) */
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* ya' = ya + yb + yc + yd */
    /* xa' = xa + xb + xc + xd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (ya + yc) - (yb + yd) */
    /* R1 = (xa + xc) - (xb + xd) */
    R0 = __SSAT(R0 - T0, 16U);
    R1 = __SSAT(R1 - T1, 16U);

    /* co2 & si2 are read from Coefficient pointer */
    Co2 = pCoef16[2U * ic * 2U];
    Si2 = pCoef16[(2U * ic * 2U) + 1];

    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);

    /*  Reading i0+fftLen/4 again, before the slot is overwritten below */
    /* input is down scale by 4 to avoid overflow */
    /* T0 = yb, T1 =  xb */
    T0 = pSrc16[i1 * 2U] >> 2;
    T1 = pSrc16[(i1 * 2U) + 1] >> 2;

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    pSrc16[i1 * 2U] = out1;
    pSrc16[(i1 * 2U) + 1] = out2;

    /*  Butterfly calculations */
    /* input is down scale by 4 to avoid overflow */
    /* U0 = yd, U1 = xd */
    U0 = pSrc16[i3 * 2U] >> 2;
    U1 = pSrc16[(i3 * 2U) + 1] >> 2;
    /* T0 = yb-yd */
    T0 = __SSAT(T0 - U0, 16);
    /* T1 = xb-xd */
    T1 = __SSAT(T1 - U1, 16);

    /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);

    /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);

    /* co1 & si1 are read from Coefficient pointer */
    Co1 = pCoef16[ic * 2U];
    Si1 = pCoef16[(ic * 2U) + 1];
    /*  Butterfly process for the i0+fftLen/2 sample */
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);

    /* writing output(xb', yb') in little endian format */
    pSrc16[i2 * 2U] = out1;
    pSrc16[(i2 * 2U) + 1] = out2;

    /* Co3 & si3 are read from Coefficient pointer */
    Co3 = pCoef16[3U * (ic * 2U)];
    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
    /*  Butterfly process for the i0+3fftLen/4 sample */
    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
    /* writing output(xd', yd') in little endian format */
    pSrc16[i3 * 2U] = out1;
    pSrc16[(i3 * 2U) + 1] = out2;

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

    /*  Updating input index */
    i0 = i0 + 1U;

  } while (--j);
  /* data is in 4.12(q12) format */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      Co1 = pCoef16[ic * 2U];
      Si1 = pCoef16[(ic * 2U) + 1U];
      Co2 = pCoef16[2U * (ic * 2U)];
      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
      Co3 = pCoef16[3U * (ic * 2U)];
      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  index calculation for the input as, */
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /*  Reading i0, i0+fftLen/2 inputs */
        /* Read ya (real), xa(imag) input */
        T0 = pSrc16[i0 * 2U];
        T1 = pSrc16[(i0 * 2U) + 1U];

        /* Read yc (real), xc(imag) input */
        S0 = pSrc16[i2 * 2U];
        S1 = pSrc16[(i2 * 2U) + 1U];

        /* R0 = (ya + yc), R1 = (xa + xc) */
        R0 = __SSAT(T0 + S0, 16);
        R1 = __SSAT(T1 + S1, 16);

        /* S0 = (ya - yc), S1 =(xa - xc) */
        S0 = __SSAT(T0 - S0, 16);
        S1 = __SSAT(T1 - S1, 16);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* Read yb (real), xb(imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /* Read yd (real), xd(imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];


        /* T0 = (yb + yd), T1 = (xb + xd) */
        T0 = __SSAT(T0 + U0, 16);
        T1 = __SSAT(T1 + U1, 16);

        /*  writing the butterfly processed i0 sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
        out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;

        pSrc16[i0 * 2U] = out1;
        pSrc16[(2U * i0) + 1U] = out2;

        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
        R0 = (R0 >> 1U) - (T0 >> 1U);
        R1 = (R1 >> 1U) - (T1 >> 1U);

        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);

        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);

        /*  Reading i0+fftLen/4 again, before the slot is overwritten below */
        /* Read yb (real), xb(imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /*  writing the butterfly processed i0 + fftLen/4 sample */
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        pSrc16[i1 * 2U] = out1;
        pSrc16[(i1 * 2U) + 1U] = out2;

        /*  Butterfly calculations */

        /* Read yd (real), xd(imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = yb-yd, T1 = xb-xd */
        T0 = __SSAT(T0 - U0, 16);
        T1 = __SSAT(T1 - U1, 16);

        /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
        R0 = (S0 >> 1U) - (T1 >> 1U);
        R1 = (S1 >> 1U) + (T0 >> 1U);

        /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
        S0 = (S0 >> 1U) + (T1 >> 1U);
        S1 = (S1 >> 1U) - (T0 >> 1U);

        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);

        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);

        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        pSrc16[i2 * 2U] = out1;
        pSrc16[(i2 * 2U) + 1U] = out2;

        /*  Butterfly process for the i0+3fftLen/4 sample */
        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);

        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
        pSrc16[i3 * 2U] = out1;
        pSrc16[(i3 * 2U) + 1U] = out2;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  n1 = n2;
  n2 >>= 2U;

  /* start of last stage process */

  /*  Butterfly implementation */
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  {
    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */
    /* Read ya (real), xa(imag) input */
    T0 = pSrc16[i0 * 2U];
    T1 = pSrc16[(i0 * 2U) + 1U];

    /* Read yc (real), xc(imag) input */
    S0 = pSrc16[i2 * 2U];
    S1 = pSrc16[(i2 * 2U) + 1U];

    /* R0 = (ya + yc), R1 = (xa + xc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);

    /* S0 = (ya - yc), S1 = (xa - xc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];
    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];

    /* T0 = (yb + yd), T1 = (xb + xd)) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
    R0 = (R0 >> 1U) - (T0 >> 1U);
    R1 = (R1 >> 1U) - (T1 >> 1U);
    /* Read yb (real), xb(imag) input again, before the slot is overwritten below */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];

    /*  writing the butterfly processed i0 + fftLen/4 sample */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    pSrc16[i1 * 2U] = R0;
    pSrc16[(i1 * 2U) + 1U] = R1;

    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];
    /* T0 = (yb - yd), T1 = (xb - xd)  */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);

    /*  writing the butterfly processed i0 + fftLen/2 sample */
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);

    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);

  }

  /* end of last stage process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */

#endif /* #if defined (ARM_MATH_DSP) */

}
975 
976 
977 /**
978   @brief         Core function for the Q15 CIFFT butterfly process.
979   @param[in,out] pSrc16           points to the in-place buffer of Q15 data type
980   @param[in]     fftLen           length of the FFT
981   @param[in]     pCoef16          points to twiddle coefficient buffer
982   @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
983   @return        none
984  */
985 
986 /*
987  * Radix-4 IFFT algorithm used is :
988  *
989  * CIFFT uses same twiddle coefficients as CFFT function
990  *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
991  *
992  *
993  * IFFT is implemented with following changes in equations from FFT
994  *
995  * Input real and imaginary data:
996  * x(n) = xa + j * ya
997  * x(n+N/4 ) = xb + j * yb
998  * x(n+N/2 ) = xc + j * yc
999  * x(n+3N 4) = xd + j * yd
1000  *
1001  *
1002  * Output real and imaginary data:
1003  * x(4r) = xa'+ j * ya'
1004  * x(4r+1) = xb'+ j * yb'
1005  * x(4r+2) = xc'+ j * yc'
1006  * x(4r+3) = xd'+ j * yd'
1007  *
1008  *
1009  * Twiddle factors for radix-4 IFFT:
1010  * Wn = co1 + j * (si1)
1011  * W2n = co2 + j * (si2)
1012  * W3n = co3 + j * (si3)
1013 
1014  * The real and imaginary output values for the radix-4 butterfly are
1015  * xa' = xa + xb + xc + xd
1016  * ya' = ya + yb + yc + yd
1017  * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
1018  * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
1019  * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
1020  * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
1021  * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
1022  * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
1023  *
1024  */
1025 
arm_radix4_butterfly_inverse_q15(q15_t * pSrc16,uint32_t fftLen,const q15_t * pCoef16,uint32_t twidCoefModifier)1026 void arm_radix4_butterfly_inverse_q15(
1027         q15_t * pSrc16,
1028         uint32_t fftLen,
1029   const q15_t * pCoef16,
1030         uint32_t twidCoefModifier)
1031 {
1032 
1033 #if defined (ARM_MATH_DSP)
1034 
1035         q31_t R, S, T, U;
1036         q31_t C1, C2, C3, out1, out2;
1037         uint32_t n1, n2, ic, i0, j, k;
1038 
1039         q15_t *ptr1;
1040         q15_t *pSi0;
1041         q15_t *pSi1;
1042         q15_t *pSi2;
1043         q15_t *pSi3;
1044 
1045         q31_t xaya, xbyb, xcyc, xdyd;
1046 
1047   /* Total process is divided into three stages */
1048 
1049   /* process first stage, middle stages, & last stage */
1050 
1051   /*  Initializations for the first stage */
1052   n2 = fftLen;
1053   n1 = n2;
1054 
1055   /* n2 = fftLen/4 */
1056   n2 >>= 2U;
1057 
1058   /* Index for twiddle coefficient */
1059   ic = 0U;
1060 
1061   /* Index for input read and output write */
1062   j = n2;
1063 
1064   pSi0 = pSrc16;
1065   pSi1 = pSi0 + 2 * n2;
1066   pSi2 = pSi1 + 2 * n2;
1067   pSi3 = pSi2 + 2 * n2;
1068 
1069   /* Input is in 1.15(q15) format */
1070 
1071   /*  start of first stage process */
1072   do
1073   {
1074     /*  Butterfly implementation */
1075 
1076     /*  Reading i0, i0+fftLen/2 inputs */
1077     /* Read ya (real), xa(imag) input */
1078     T = read_q15x2 (pSi0);
1079     T = __SHADD16(T, 0);
1080     T = __SHADD16(T, 0);
1081 
1082     /* Read yc (real), xc(imag) input */
1083     S = read_q15x2 (pSi2);
1084     S = __SHADD16(S, 0);
1085     S = __SHADD16(S, 0);
1086 
1087     /* R = packed((ya + yc), (xa + xc) ) */
1088     R = __QADD16(T, S);
1089 
1090     /* S = packed((ya - yc), (xa - xc) ) */
1091     S = __QSUB16(T, S);
1092 
1093     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1094     /* Read yb (real), xb(imag) input */
1095     T = read_q15x2 (pSi1);
1096     T = __SHADD16(T, 0);
1097     T = __SHADD16(T, 0);
1098 
1099     /* Read yd (real), xd(imag) input */
1100     U = read_q15x2 (pSi3);
1101     U = __SHADD16(U, 0);
1102     U = __SHADD16(U, 0);
1103 
1104     /* T = packed((yb + yd), (xb + xd) ) */
1105     T = __QADD16(T, U);
1106 
1107     /*  writing the butterfly processed i0 sample */
1108     /* xa' = xa + xb + xc + xd */
1109     /* ya' = ya + yb + yc + yd */
1110     write_q15x2_ia (&pSi0, __SHADD16(R, T));
1111 
1112     /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
1113     R = __QSUB16(R, T);
1114 
1115     /* co2 & si2 are read from SIMD Coefficient pointer */
1116     C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
1117 
1118 #ifndef ARM_MATH_BIG_ENDIAN
1119     /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1120     out1 = __SMUSD(C2, R) >> 16U;
1121     /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1122     out2 = __SMUADX(C2, R);
1123 #else
1124     /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1125     out1 = __SMUADX(C2, R) >> 16U;
1126     /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1127     out2 = __SMUSD(__QSUB16(0, C2), R);
1128 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1129 
1130     /*  Reading i0+fftLen/4 */
1131     /* T = packed(yb, xb) */
1132     T = read_q15x2 (pSi1);
1133     T = __SHADD16(T, 0);
1134     T = __SHADD16(T, 0);
1135 
1136     /* writing the butterfly processed i0 + fftLen/4 sample */
1137     /* writing output(xc', yc') in little endian format */
1138     write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
1139 
1140     /*  Butterfly calculations */
1141     /* U = packed(yd, xd) */
1142     U = read_q15x2 (pSi3);
1143     U = __SHADD16(U, 0);
1144     U = __SHADD16(U, 0);
1145 
1146     /* T = packed(yb-yd, xb-xd) */
1147     T = __QSUB16(T, U);
1148 
1149 #ifndef ARM_MATH_BIG_ENDIAN
1150     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1151     R = __QSAX(S, T);
1152     /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
1153     S = __QASX(S, T);
1154 #else
1155     /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1156     R = __QASX(S, T);
1157     /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1158     S = __QSAX(S, T);
1159 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1160 
1161     /* co1 & si1 are read from SIMD Coefficient pointer */
1162     C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
1163     /*  Butterfly process for the i0+fftLen/2 sample */
1164 
1165 #ifndef ARM_MATH_BIG_ENDIAN
1166     /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1167     out1 = __SMUSD(C1, S) >> 16U;
1168     /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1169     out2 = __SMUADX(C1, S);
1170 #else
1171     /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1172     out1 = __SMUADX(C1, S) >> 16U;
1173     /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1174     out2 = __SMUSD(__QSUB16(0, C1), S);
1175 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1176 
1177     /* writing output(xb', yb') in little endian format */
1178     write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
1179 
1180     /* co3 & si3 are read from SIMD Coefficient pointer */
1181     C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
1182     /*  Butterfly process for the i0+3fftLen/4 sample */
1183 
1184 #ifndef ARM_MATH_BIG_ENDIAN
1185     /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1186     out1 = __SMUSD(C3, R) >> 16U;
1187     /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1188     out2 = __SMUADX(C3, R);
1189 #else
1190     /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1191     out1 = __SMUADX(C3, R) >> 16U;
1192     /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1193     out2 = __SMUSD(__QSUB16(0, C3), R);
1194 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1195 
1196     /* writing output(xd', yd') in little endian format */
1197     write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
1198 
1199     /*  Twiddle coefficients index modifier */
1200     ic = ic + twidCoefModifier;
1201 
1202   } while (--j);
1203   /* data is in 4.11(q11) format */
1204 
1205   /* end of first stage process */
1206 
1207 
1208   /* start of middle stage process */
1209 
1210   /*  Twiddle coefficients index modifier */
1211   twidCoefModifier <<= 2U;
1212 
1213   /*  Calculation of Middle stage */
1214   for (k = fftLen / 4U; k > 4U; k >>= 2U)
1215   {
1216     /*  Initializations for the middle stage */
1217     n1 = n2;
1218     n2 >>= 2U;
1219     ic = 0U;
1220 
1221     for (j = 0U; j <= (n2 - 1U); j++)
1222     {
1223       /*  index calculation for the coefficients */
1224       C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
1225       C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
1226       C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
1227 
1228       /*  Twiddle coefficients index modifier */
1229       ic = ic + twidCoefModifier;
1230 
1231       pSi0 = pSrc16 + 2 * j;
1232       pSi1 = pSi0 + 2 * n2;
1233       pSi2 = pSi1 + 2 * n2;
1234       pSi3 = pSi2 + 2 * n2;
1235 
1236       /*  Butterfly implementation */
1237       for (i0 = j; i0 < fftLen; i0 += n1)
1238       {
1239         /*  Reading i0, i0+fftLen/2 inputs */
1240         /* Read ya (real), xa(imag) input */
1241         T = read_q15x2 (pSi0);
1242 
1243         /* Read yc (real), xc(imag) input */
1244         S = read_q15x2 (pSi2);
1245 
1246         /* R = packed( (ya + yc), (xa + xc)) */
1247         R = __QADD16(T, S);
1248 
1249         /* S = packed((ya - yc), (xa - xc)) */
1250         S = __QSUB16(T, S);
1251 
1252         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1253         /* Read yb (real), xb(imag) input */
1254         T = read_q15x2 (pSi1);
1255 
1256         /* Read yd (real), xd(imag) input */
1257         U = read_q15x2 (pSi3);
1258 
1259         /* T = packed( (yb + yd), (xb + xd)) */
1260         T = __QADD16(T, U);
1261 
1262         /*  writing the butterfly processed i0 sample */
1263 
1264         /* xa' = xa + xb + xc + xd */
1265         /* ya' = ya + yb + yc + yd */
1266         out1 = __SHADD16(R, T);
1267         out1 = __SHADD16(out1, 0);
1268         write_q15x2 (pSi0, out1);
1269         pSi0 += 2 * n1;
1270 
1271         /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
1272         R = __SHSUB16(R, T);
1273 
1274 #ifndef ARM_MATH_BIG_ENDIAN
1275         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1276         out1 = __SMUSD(C2, R) >> 16U;
1277 
1278         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1279         out2 = __SMUADX(C2, R);
1280 #else
1281         /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1282         out1 = __SMUADX(R, C2) >> 16U;
1283 
1284         /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
1285         out2 = __SMUSD(__QSUB16(0, C2), R);
1286 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1287 
1288         /*  Reading i0+3fftLen/4 */
1289         /* Read yb (real), xb(imag) input */
1290         T = read_q15x2 (pSi1);
1291 
1292         /*  writing the butterfly processed i0 + fftLen/4 sample */
1293         /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
1294         /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
1295         write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
1296         pSi1 += 2 * n1;
1297 
1298         /*  Butterfly calculations */
1299 
1300         /* Read yd (real), xd(imag) input */
1301         U = read_q15x2 (pSi3);
1302 
1303         /* T = packed(yb-yd, xb-xd) */
1304         T = __QSUB16(T, U);
1305 
1306 #ifndef ARM_MATH_BIG_ENDIAN
1307         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1308         R = __SHSAX(S, T);
1309 
1310         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1311         S = __SHASX(S, T);
1312 
1313         /*  Butterfly process for the i0+fftLen/2 sample */
1314         out1 = __SMUSD(C1, S) >> 16U;
1315         out2 = __SMUADX(C1, S);
1316 #else
1317         /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
1318         R = __SHASX(S, T);
1319 
1320         /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
1321         S = __SHSAX(S, T);
1322 
1323         /*  Butterfly process for the i0+fftLen/2 sample */
1324         out1 = __SMUADX(S, C1) >> 16U;
1325         out2 = __SMUSD(__QSUB16(0, C1), S);
1326 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1327 
1328         /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
1329         /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
1330         write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
1331         pSi2 += 2 * n1;
1332 
1333         /*  Butterfly process for the i0+3fftLen/4 sample */
1334 
1335 #ifndef ARM_MATH_BIG_ENDIAN
1336         out1 = __SMUSD(C3, R) >> 16U;
1337         out2 = __SMUADX(C3, R);
1338 #else
1339         out1 = __SMUADX(C3, R) >> 16U;
1340         out2 = __SMUSD(__QSUB16(0, C3), R);
1341 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1342 
1343         /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
1344         /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
1345         write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
1346         pSi3 += 2 * n1;
1347       }
1348     }
1349     /*  Twiddle coefficients index modifier */
1350     twidCoefModifier <<= 2U;
1351   }
1352   /* end of middle stage process */
1353 
1354   /* data is in 10.6(q6) format for the 1024 point */
1355   /* data is in 8.8(q8) format for the 256 point */
1356   /* data is in 6.10(q10) format for the 64 point */
1357   /* data is in 4.12(q12) format for the 16 point */
1358 
1359   /*  Initializations for the last stage */
1360   j = fftLen >> 2;
1361 
1362   ptr1 = &pSrc16[0];
1363 
1364   /* start of last stage process */
1365 
1366   /*  Butterfly implementation */
1367   do
1368   {
1369     /* Read xa (real), ya(imag) input */
1370     xaya = read_q15x2_ia (&ptr1);
1371 
1372     /* Read xb (real), yb(imag) input */
1373     xbyb = read_q15x2_ia (&ptr1);
1374 
1375     /* Read xc (real), yc(imag) input */
1376     xcyc = read_q15x2_ia (&ptr1);
1377 
1378     /* Read xd (real), yd(imag) input */
1379     xdyd = read_q15x2_ia (&ptr1);
1380 
1381     /* R = packed((ya + yc), (xa + xc)) */
1382     R = __QADD16(xaya, xcyc);
1383 
1384     /* T = packed((yb + yd), (xb + xd)) */
1385     T = __QADD16(xbyb, xdyd);
1386 
1387     /* pointer updation for writing */
1388     ptr1 = ptr1 - 8U;
1389 
1390 
1391     /* xa' = xa + xb + xc + xd */
1392     /* ya' = ya + yb + yc + yd */
1393     write_q15x2_ia (&ptr1, __SHADD16(R, T));
1394 
1395     /* T = packed((yb + yd), (xb + xd)) */
1396     T = __QADD16(xbyb, xdyd);
1397 
1398     /* xc' = (xa-xb+xc-xd) */
1399     /* yc' = (ya-yb+yc-yd) */
1400     write_q15x2_ia (&ptr1, __SHSUB16(R, T));
1401 
1402     /* S = packed((ya - yc), (xa - xc)) */
1403     S = __QSUB16(xaya, xcyc);
1404 
1405     /* Read yd (real), xd(imag) input */
1406     /* T = packed( (yb - yd), (xb - xd))  */
1407     U = __QSUB16(xbyb, xdyd);
1408 
1409 #ifndef ARM_MATH_BIG_ENDIAN
1410     /* xb' = (xa+yb-xc-yd) */
1411     /* yb' = (ya-xb-yc+xd) */
1412     write_q15x2_ia (&ptr1, __SHASX(S, U));
1413 
1414     /* xd' = (xa-yb-xc+yd) */
1415     /* yd' = (ya+xb-yc-xd) */
1416     write_q15x2_ia (&ptr1, __SHSAX(S, U));
1417 #else
1418     /* xb' = (xa+yb-xc-yd) */
1419     /* yb' = (ya-xb-yc+xd) */
1420     write_q15x2_ia (&ptr1, __SHSAX(S, U));
1421 
1422     /* xd' = (xa-yb-xc+yd) */
1423     /* yd' = (ya+xb-yc-xd) */
1424     write_q15x2_ia (&ptr1, __SHASX(S, U));
1425 #endif /* #ifndef ARM_MATH_BIG_ENDIAN */
1426 
1427   } while (--j);
1428 
1429   /* end of last stage  process */
1430 
1431   /* output is in 11.5(q5) format for the 1024 point */
1432   /* output is in 9.7(q7) format for the 256 point   */
1433   /* output is in 7.9(q9) format for the 64 point  */
1434   /* output is in 5.11(q11) format for the 16 point  */
1435 
1436 
1437 #else /* arm_radix4_butterfly_inverse_q15 */
1438 
1439         q15_t R0, R1, S0, S1, T0, T1, U0, U1;
1440         q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
1441         uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
1442 
1443   /* Total process is divided into three stages */
1444 
1445   /* process first stage, middle stages, & last stage */
1446 
1447   /*  Initializations for the first stage */
1448   n2 = fftLen;
1449   n1 = n2;
1450 
1451   /* n2 = fftLen/4 */
1452   n2 >>= 2U;
1453 
1454   /* Index for twiddle coefficient */
1455   ic = 0U;
1456 
1457   /* Index for input read and output write */
1458   i0 = 0U;
1459 
1460   j = n2;
1461 
1462   /* Input is in 1.15(q15) format */
1463 
1464   /*  Start of first stage process */
1465   do
1466   {
1467     /*  Butterfly implementation */
1468 
1469     /*  index calculation for the input as, */
1470     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1471     i1 = i0 + n2;
1472     i2 = i1 + n2;
1473     i3 = i2 + n2;
1474 
1475     /*  Reading i0, i0+fftLen/2 inputs */
1476     /* input is down scale by 4 to avoid overflow */
1477     /* Read ya (real), xa(imag) input */
1478     T0 = pSrc16[i0 * 2U] >> 2U;
1479     T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
1480     /* input is down scale by 4 to avoid overflow */
1481     /* Read yc (real), xc(imag) input */
1482     S0 = pSrc16[i2 * 2U] >> 2U;
1483     S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
1484 
1485     /* R0 = (ya + yc), R1 = (xa + xc) */
1486     R0 = __SSAT(T0 + S0, 16U);
1487     R1 = __SSAT(T1 + S1, 16U);
1488     /* S0 = (ya - yc), S1 = (xa - xc) */
1489     S0 = __SSAT(T0 - S0, 16U);
1490     S1 = __SSAT(T1 - S1, 16U);
1491 
1492     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1493     /* input is down scale by 4 to avoid overflow */
1494     /* Read yb (real), xb(imag) input */
1495     T0 = pSrc16[i1 * 2U] >> 2U;
1496     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1497     /* Read yd (real), xd(imag) input */
1498     /* input is down scale by 4 to avoid overflow */
1499     U0 = pSrc16[i3 * 2U] >> 2U;
1500     U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1501 
1502     /* T0 = (yb + yd), T1 = (xb + xd) */
1503     T0 = __SSAT(T0 + U0, 16U);
1504     T1 = __SSAT(T1 + U1, 16U);
1505 
1506     /*  writing the butterfly processed i0 sample */
1507     /* xa' = xa + xb + xc + xd */
1508     /* ya' = ya + yb + yc + yd */
1509     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1510     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1511 
1512     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
1513     R0 = __SSAT(R0 - T0, 16U);
1514     R1 = __SSAT(R1 - T1, 16U);
1515     /* co2 & si2 are read from Coefficient pointer */
1516     Co2 = pCoef16[2U * ic * 2U];
1517     Si2 = pCoef16[(2U * ic * 2U) + 1U];
1518     /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1519     out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
1520     /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1521     out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
1522 
1523     /*  Reading i0+fftLen/4 */
1524     /* input is down scale by 4 to avoid overflow */
1525     /* T0 = yb, T1 = xb */
1526     T0 = pSrc16[i1 * 2U] >> 2U;
1527     T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
1528 
1529     /* writing the butterfly processed i0 + fftLen/4 sample */
1530     /* writing output(xc', yc') in little endian format */
1531     pSrc16[i1 * 2U] = out1;
1532     pSrc16[(i1 * 2U) + 1U] = out2;
1533 
1534     /*  Butterfly calculations */
1535     /* input is down scale by 4 to avoid overflow */
1536     /* U0 = yd, U1 = xd) */
1537     U0 = pSrc16[i3 * 2U] >> 2U;
1538     U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
1539 
1540     /* T0 = yb-yd, T1 = xb-xd) */
1541     T0 = __SSAT(T0 - U0, 16U);
1542     T1 = __SSAT(T1 - U1, 16U);
1543     /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
1544     R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
1545     R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
1546     /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
1547     S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
1548     S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
1549 
1550     /* co1 & si1 are read from Coefficient pointer */
1551     Co1 = pCoef16[ic * 2U];
1552     Si1 = pCoef16[(ic * 2U) + 1U];
1553     /*  Butterfly process for the i0+fftLen/2 sample */
1554     /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1555     out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1556     /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1557     out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1558     /* writing output(xb', yb') in little endian format */
1559     pSrc16[i2 * 2U] = out1;
1560     pSrc16[(i2 * 2U) + 1U] = out2;
1561 
1562     /* Co3 & si3 are read from Coefficient pointer */
1563     Co3 = pCoef16[3U * ic * 2U];
1564     Si3 = pCoef16[(3U * ic * 2U) + 1U];
1565     /*  Butterfly process for the i0+3fftLen/4 sample */
1566     /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1567     out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1568     /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1569     out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1570     /* writing output(xd', yd') in little endian format */
1571     pSrc16[i3 * 2U] = out1;
1572     pSrc16[(i3 * 2U) + 1U] = out2;
1573 
1574     /*  Twiddle coefficients index modifier */
1575     ic = ic + twidCoefModifier;
1576 
1577     /*  Updating input index */
1578     i0 = i0 + 1U;
1579 
1580   } while (--j);
1581 
1582   /*  End of first stage process */
1583 
1584   /* data is in 4.11(q11) format */
1585 
1586 
1587   /*  Start of Middle stage process */
1588 
1589   /*  Twiddle coefficients index modifier */
1590   twidCoefModifier <<= 2U;
1591 
1592   /*  Calculation of Middle stage */
1593   for (k = fftLen / 4U; k > 4U; k >>= 2U)
1594   {
1595     /*  Initializations for the middle stage */
1596     n1 = n2;
1597     n2 >>= 2U;
1598     ic = 0U;
1599 
1600     for (j = 0U; j <= (n2 - 1U); j++)
1601     {
1602       /*  index calculation for the coefficients */
1603       Co1 = pCoef16[ic * 2U];
1604       Si1 = pCoef16[(ic * 2U) + 1U];
1605       Co2 = pCoef16[2U * ic * 2U];
1606       Si2 = pCoef16[2U * ic * 2U + 1U];
1607       Co3 = pCoef16[3U * ic * 2U];
1608       Si3 = pCoef16[(3U * ic * 2U) + 1U];
1609 
1610       /*  Twiddle coefficients index modifier */
1611       ic = ic + twidCoefModifier;
1612 
1613       /*  Butterfly implementation */
1614       for (i0 = j; i0 < fftLen; i0 += n1)
1615       {
1616         /*  index calculation for the input as, */
1617         /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1618         i1 = i0 + n2;
1619         i2 = i1 + n2;
1620         i3 = i2 + n2;
1621 
1622         /*  Reading i0, i0+fftLen/2 inputs */
1623         /* Read ya (real), xa(imag) input */
1624         T0 = pSrc16[i0 * 2U];
1625         T1 = pSrc16[(i0 * 2U) + 1U];
1626 
1627         /* Read yc (real), xc(imag) input */
1628         S0 = pSrc16[i2 * 2U];
1629         S1 = pSrc16[(i2 * 2U) + 1U];
1630 
1631 
1632         /* R0 = (ya + yc), R1 = (xa + xc) */
1633         R0 = __SSAT(T0 + S0, 16U);
1634         R1 = __SSAT(T1 + S1, 16U);
1635         /* S0 = (ya - yc), S1 = (xa - xc) */
1636         S0 = __SSAT(T0 - S0, 16U);
1637         S1 = __SSAT(T1 - S1, 16U);
1638 
1639         /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1640         /* Read yb (real), xb(imag) input */
1641         T0 = pSrc16[i1 * 2U];
1642         T1 = pSrc16[(i1 * 2U) + 1U];
1643 
1644         /* Read yd (real), xd(imag) input */
1645         U0 = pSrc16[i3 * 2U];
1646         U1 = pSrc16[(i3 * 2U) + 1U];
1647 
1648         /* T0 = (yb + yd), T1 = (xb + xd) */
1649         T0 = __SSAT(T0 + U0, 16U);
1650         T1 = __SSAT(T1 + U1, 16U);
1651 
1652         /*  writing the butterfly processed i0 sample */
1653         /* xa' = xa + xb + xc + xd */
1654         /* ya' = ya + yb + yc + yd */
1655         pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
1656         pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
1657 
1658         /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1659         R0 = (R0 >> 1U) - (T0 >> 1U);
1660         R1 = (R1 >> 1U) - (T1 >> 1U);
1661 
1662         /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
1663         out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
1664         /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1665         out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
1666 
1667         /*  Reading i0+3fftLen/4 */
1668         /* Read yb (real), xb(imag) input */
1669         T0 = pSrc16[i1 * 2U];
1670         T1 = pSrc16[(i1 * 2U) + 1U];
1671 
1672         /*  writing the butterfly processed i0 + fftLen/4 sample */
1673         /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
1674         /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
1675         pSrc16[i1 * 2U] = out1;
1676         pSrc16[(i1 * 2U) + 1U] = out2;
1677 
1678         /*  Butterfly calculations */
1679         /* Read yd (real), xd(imag) input */
1680         U0 = pSrc16[i3 * 2U];
1681         U1 = pSrc16[(i3 * 2U) + 1U];
1682 
1683         /* T0 = yb-yd, T1 = xb-xd) */
1684         T0 = __SSAT(T0 - U0, 16U);
1685         T1 = __SSAT(T1 - U1, 16U);
1686 
1687         /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
1688         R0 = (S0 >> 1U) + (T1 >> 1U);
1689         R1 = (S1 >> 1U) - (T0 >> 1U);
1690 
1691         /* S1 = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
1692         S0 = (S0 >> 1U) - (T1 >> 1U);
1693         S1 = (S1 >> 1U) + (T0 >> 1U);
1694 
1695         /*  Butterfly process for the i0+fftLen/2 sample */
1696         out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
1697         out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
1698         /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
1699         /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
1700         pSrc16[i2 * 2U] = out1;
1701         pSrc16[(i2 * 2U) + 1U] = out2;
1702 
1703         /*  Butterfly process for the i0+3fftLen/4 sample */
1704         out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
1705 
1706         out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
1707         /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
1708         /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
1709         pSrc16[i3 * 2U] = out1;
1710         pSrc16[(i3 * 2U) + 1U] = out2;
1711 
1712 
1713       }
1714     }
1715     /*  Twiddle coefficients index modifier */
1716     twidCoefModifier <<= 2U;
1717   }
1718   /*  End of Middle stages process */
1719 
1720 
1721   /* data is in 10.6(q6) format for the 1024 point */
1722   /* data is in 8.8(q8) format for the 256 point   */
1723   /* data is in 6.10(q10) format for the 64 point  */
1724   /* data is in 4.12(q12) format for the 16 point  */
1725 
1726   /* start of last stage process */
1727 
1728 
1729   /*  Initializations for the last stage */
1730   n1 = n2;
1731   n2 >>= 2U;
1732 
1733   /*  Butterfly implementation */
1734   for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
1735   {
1736     /*  index calculation for the input as, */
1737     /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
1738     i1 = i0 + n2;
1739     i2 = i1 + n2;
1740     i3 = i2 + n2;
1741 
1742     /*  Reading i0, i0+fftLen/2 inputs */
1743     /* Read ya (real), xa(imag) input */
1744     T0 = pSrc16[i0 * 2U];
1745     T1 = pSrc16[(i0 * 2U) + 1U];
1746     /* Read yc (real), xc(imag) input */
1747     S0 = pSrc16[i2 * 2U];
1748     S1 = pSrc16[(i2 * 2U) + 1U];
1749 
1750     /* R0 = (ya + yc), R1 = (xa + xc) */
1751     R0 = __SSAT(T0 + S0, 16U);
1752     R1 = __SSAT(T1 + S1, 16U);
1753     /* S0 = (ya - yc), S1 = (xa - xc) */
1754     S0 = __SSAT(T0 - S0, 16U);
1755     S1 = __SSAT(T1 - S1, 16U);
1756 
1757     /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
1758     /* Read yb (real), xb(imag) input */
1759     T0 = pSrc16[i1 * 2U];
1760     T1 = pSrc16[(i1 * 2U) + 1U];
1761     /* Read yd (real), xd(imag) input */
1762     U0 = pSrc16[i3 * 2U];
1763     U1 = pSrc16[(i3 * 2U) + 1U];
1764 
1765     /* T0 = (yb + yd), T1 = (xb + xd) */
1766     T0 = __SSAT(T0 + U0, 16U);
1767     T1 = __SSAT(T1 + U1, 16U);
1768 
1769     /*  writing the butterfly processed i0 sample */
1770     /* xa' = xa + xb + xc + xd */
1771     /* ya' = ya + yb + yc + yd */
1772     pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
1773     pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
1774 
1775     /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
1776     R0 = (R0 >> 1U) - (T0 >> 1U);
1777     R1 = (R1 >> 1U) - (T1 >> 1U);
1778 
1779     /* Read yb (real), xb(imag) input */
1780     T0 = pSrc16[i1 * 2U];
1781     T1 = pSrc16[(i1 * 2U) + 1U];
1782 
1783     /*  writing the butterfly processed i0 + fftLen/4 sample */
1784     /* xc' = (xa-xb+xc-xd) */
1785     /* yc' = (ya-yb+yc-yd) */
1786     pSrc16[i1 * 2U] = R0;
1787     pSrc16[(i1 * 2U) + 1U] = R1;
1788 
1789     /* Read yd (real), xd(imag) input */
1790     U0 = pSrc16[i3 * 2U];
1791     U1 = pSrc16[(i3 * 2U) + 1U];
1792     /* T0 = (yb - yd), T1 = (xb - xd) */
1793     T0 = __SSAT(T0 - U0, 16U);
1794     T1 = __SSAT(T1 - U1, 16U);
1795 
1796     /*  writing the butterfly processed i0 + fftLen/2 sample */
1797     /* xb' = (xa-yb-xc+yd) */
1798     /* yb' = (ya+xb-yc-xd) */
1799     pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
1800     pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
1801 
1802 
1803     /*  writing the butterfly processed i0 + 3fftLen/4 sample */
1804     /* xd' = (xa+yb-xc-yd) */
1805     /* yd' = (ya-xb-yc+xd) */
1806     pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
1807     pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
1808   }
1809   /* end of last stage  process */
1810 
1811   /* output is in 11.5(q5) format for the 1024 point */
1812   /* output is in 9.7(q7) format for the 256 point   */
1813   /* output is in 7.9(q9) format for the 64 point  */
1814   /* output is in 5.11(q11) format for the 16 point  */
1815 
1816 #endif /* #if defined (ARM_MATH_DSP) */
1817 
1818 }
1819