1 /* ----------------------------------------------------------------------
2 * Project: CMSIS DSP Library
3 * Title: arm_rfft_fast_f32.c
4 * Description: RFFT & RIFFT Floating point process function
5 *
6 * $Date: 23 April 2021
7 * $Revision: V1.9.0
8 *
9 * Target Processor: Cortex-M and Cortex-A cores
10 * -------------------------------------------------------------------- */
11 /*
12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
13 *
14 * SPDX-License-Identifier: Apache-2.0
15 *
16 * Licensed under the Apache License, Version 2.0 (the License); you may
17 * not use this file except in compliance with the License.
18 * You may obtain a copy of the License at
19 *
20 * www.apache.org/licenses/LICENSE-2.0
21 *
22 * Unless required by applicable law or agreed to in writing, software
23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
25 * See the License for the specific language governing permissions and
26 * limitations under the License.
27 */
28
29 #include "dsp/transform_functions.h"
30
31 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
stage_rfft_f32(const arm_rfft_fast_instance_f32 * S,float32_t * p,float32_t * pOut)32 void stage_rfft_f32(
33 const arm_rfft_fast_instance_f32 * S,
34 float32_t * p,
35 float32_t * pOut)
36 {
37 int32_t k; /* Loop Counter */
38 float32_t twR, twI; /* RFFT Twiddle coefficients */
39 const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
40 float32_t *pA = p; /* increasing pointer */
41 float32_t *pB = p; /* decreasing pointer */
42 float32_t xAR, xAI, xBR, xBI; /* temporary variables */
43 float32_t t1a, t1b; /* temporary variables */
44 float32_t p0, p1, p2, p3; /* temporary variables */
45
46 float32x4x2_t tw,xA,xB;
47 float32x4x2_t tmp1, tmp2, res;
48
49 uint32x4_t vecStridesFwd, vecStridesBkwd;
50
51 vecStridesFwd = vidupq_u32((uint32_t)0, 2);
52 vecStridesBkwd = -vecStridesFwd;
53
54 int blockCnt;
55
56
57 k = (S->Sint).fftLen - 1;
58
59 /* Pack first and last sample of the frequency domain together */
60
61 xBR = pB[0];
62 xBI = pB[1];
63 xAR = pA[0];
64 xAI = pA[1];
65
66 twR = *pCoeff++ ;
67 twI = *pCoeff++ ;
68
69 // U1 = XA(1) + XB(1); % It is real
70 t1a = xBR + xAR ;
71
72 // U2 = XB(1) - XA(1); % It is imaginary
73 t1b = xBI + xAI ;
74
75 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
76 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
77 *pOut++ = 0.5f * ( t1a + t1b );
78 *pOut++ = 0.5f * ( t1a - t1b );
79
80 // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
81 pB = p + 2*k;
82 pA += 2;
83
84 blockCnt = k >> 2;
85 while (blockCnt > 0)
86 {
87 /*
88 function X = my_split_rfft(X, ifftFlag)
89 % X is a series of real numbers
90 L = length(X);
91 XC = X(1:2:end) +i*X(2:2:end);
92 XA = fft(XC);
93 XB = conj(XA([1 end:-1:2]));
94 TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
95 for l = 2:L/2
96 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
97 end
98 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
99 X = XA;
100 */
101
102
103 xA = vld2q_f32(pA);
104 pA += 8;
105
106 xB = vld2q_f32(pB);
107
108 xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
109 xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
110
111 xB.val[1] = vnegq_f32(xB.val[1]);
112 pB -= 8;
113
114
115 tw = vld2q_f32(pCoeff);
116 pCoeff += 8;
117
118
119 tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
120 tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
121
122 tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
123 tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
124
125 res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
126 res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
127
128 res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
129 res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
130
131 res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
132 res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
133
134 res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
135 res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
136
137
138 vst2q_f32(pOut, res);
139 pOut += 8;
140
141
142 blockCnt--;
143 }
144
145 blockCnt = k & 3;
146 while (blockCnt > 0)
147 {
148 /*
149 function X = my_split_rfft(X, ifftFlag)
150 % X is a series of real numbers
151 L = length(X);
152 XC = X(1:2:end) +i*X(2:2:end);
153 XA = fft(XC);
154 XB = conj(XA([1 end:-1:2]));
155 TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
156 for l = 2:L/2
157 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
158 end
159 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
160 X = XA;
161 */
162
163 xBI = pB[1];
164 xBR = pB[0];
165 xAR = pA[0];
166 xAI = pA[1];
167
168 twR = *pCoeff++;
169 twI = *pCoeff++;
170
171 t1a = xBR - xAR ;
172 t1b = xBI + xAI ;
173
174 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
175 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
176 p0 = twR * t1a;
177 p1 = twI * t1a;
178 p2 = twR * t1b;
179 p3 = twI * t1b;
180
181 *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
182 *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
183
184 pA += 2;
185 pB -= 2;
186 blockCnt--;
187 }
188 }
189
190 /* Prepares data for inverse cfft */
merge_rfft_f32(const arm_rfft_fast_instance_f32 * S,float32_t * p,float32_t * pOut)191 void merge_rfft_f32(
192 const arm_rfft_fast_instance_f32 * S,
193 float32_t * p,
194 float32_t * pOut)
195 {
196 int32_t k; /* Loop Counter */
197 float32_t twR, twI; /* RFFT Twiddle coefficients */
198 const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
199 float32_t *pA = p; /* increasing pointer */
200 float32_t *pB = p; /* decreasing pointer */
201 float32_t xAR, xAI, xBR, xBI; /* temporary variables */
202 float32_t t1a, t1b, r, s, t, u; /* temporary variables */
203
204 float32x4x2_t tw,xA,xB;
205 float32x4x2_t tmp1, tmp2, res;
206 uint32x4_t vecStridesFwd, vecStridesBkwd;
207
208 vecStridesFwd = vidupq_u32((uint32_t)0, 2);
209 vecStridesBkwd = -vecStridesFwd;
210
211 int blockCnt;
212
213
214 k = (S->Sint).fftLen - 1;
215
216 xAR = pA[0];
217 xAI = pA[1];
218
219 pCoeff += 2 ;
220
221 *pOut++ = 0.5f * ( xAR + xAI );
222 *pOut++ = 0.5f * ( xAR - xAI );
223
224 pB = p + 2*k ;
225 pA += 2 ;
226
227 blockCnt = k >> 2;
228 while (blockCnt > 0)
229 {
230 /* G is half of the frequency complex spectrum */
231 //for k = 2:N
232 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
233 xA = vld2q_f32(pA);
234 pA += 8;
235
236 xB = vld2q_f32(pB);
237
238 xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
239 xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
240
241 xB.val[1] = vnegq_f32(xB.val[1]);
242 pB -= 8;
243
244
245 tw = vld2q_f32(pCoeff);
246 tw.val[1] = vnegq_f32(tw.val[1]);
247 pCoeff += 8;
248
249
250 tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
251 tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
252
253 tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
254 tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
255
256 res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
257 res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
258
259 res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
260 res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
261
262 res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
263 res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
264
265 res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
266 res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
267
268
269 vst2q_f32(pOut, res);
270 pOut += 8;
271
272
273 blockCnt--;
274 }
275
276 blockCnt = k & 3;
277 while (blockCnt > 0)
278 {
279 /* G is half of the frequency complex spectrum */
280 //for k = 2:N
281 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
282 xBI = pB[1] ;
283 xBR = pB[0] ;
284 xAR = pA[0];
285 xAI = pA[1];
286
287 twR = *pCoeff++;
288 twI = *pCoeff++;
289
290 t1a = xAR - xBR ;
291 t1b = xAI + xBI ;
292
293 r = twR * t1a;
294 s = twI * t1b;
295 t = twI * t1a;
296 u = twR * t1b;
297
298 // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
299 // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
300 *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
301 *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
302
303 pA += 2;
304 pB -= 2;
305 blockCnt--;
306 }
307
308 }
309 #else
stage_rfft_f32(const arm_rfft_fast_instance_f32 * S,float32_t * p,float32_t * pOut)310 void stage_rfft_f32(
311 const arm_rfft_fast_instance_f32 * S,
312 float32_t * p,
313 float32_t * pOut)
314 {
315 int32_t k; /* Loop Counter */
316 float32_t twR, twI; /* RFFT Twiddle coefficients */
317 const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
318 float32_t *pA = p; /* increasing pointer */
319 float32_t *pB = p; /* decreasing pointer */
320 float32_t xAR, xAI, xBR, xBI; /* temporary variables */
321 float32_t t1a, t1b; /* temporary variables */
322 float32_t p0, p1, p2, p3; /* temporary variables */
323
324
325 k = (S->Sint).fftLen - 1;
326
327 /* Pack first and last sample of the frequency domain together */
328
329 xBR = pB[0];
330 xBI = pB[1];
331 xAR = pA[0];
332 xAI = pA[1];
333
334 twR = *pCoeff++ ;
335 twI = *pCoeff++ ;
336
337
338 // U1 = XA(1) + XB(1); % It is real
339 t1a = xBR + xAR ;
340
341 // U2 = XB(1) - XA(1); % It is imaginary
342 t1b = xBI + xAI ;
343
344 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
345 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
346 *pOut++ = 0.5f * ( t1a + t1b );
347 *pOut++ = 0.5f * ( t1a - t1b );
348
349 // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
350 pB = p + 2*k;
351 pA += 2;
352
353 do
354 {
355 /*
356 function X = my_split_rfft(X, ifftFlag)
357 % X is a series of real numbers
358 L = length(X);
359 XC = X(1:2:end) +i*X(2:2:end);
360 XA = fft(XC);
361 XB = conj(XA([1 end:-1:2]));
362 TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
363 for l = 2:L/2
364 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
365 end
366 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
367 X = XA;
368 */
369
370 xBI = pB[1];
371 xBR = pB[0];
372 xAR = pA[0];
373 xAI = pA[1];
374
375 twR = *pCoeff++;
376 twI = *pCoeff++;
377
378 t1a = xBR - xAR ;
379 t1b = xBI + xAI ;
380
381 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
382 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
383 p0 = twR * t1a;
384 p1 = twI * t1a;
385 p2 = twR * t1b;
386 p3 = twI * t1b;
387
388 *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
389 *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
390
391
392 pA += 2;
393 pB -= 2;
394 k--;
395 } while (k > 0);
396 }
397
398 /* Prepares data for inverse cfft */
merge_rfft_f32(const arm_rfft_fast_instance_f32 * S,float32_t * p,float32_t * pOut)399 void merge_rfft_f32(
400 const arm_rfft_fast_instance_f32 * S,
401 float32_t * p,
402 float32_t * pOut)
403 {
404 int32_t k; /* Loop Counter */
405 float32_t twR, twI; /* RFFT Twiddle coefficients */
406 const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
407 float32_t *pA = p; /* increasing pointer */
408 float32_t *pB = p; /* decreasing pointer */
409 float32_t xAR, xAI, xBR, xBI; /* temporary variables */
410 float32_t t1a, t1b, r, s, t, u; /* temporary variables */
411
412 k = (S->Sint).fftLen - 1;
413
414 xAR = pA[0];
415 xAI = pA[1];
416
417 pCoeff += 2 ;
418
419 *pOut++ = 0.5f * ( xAR + xAI );
420 *pOut++ = 0.5f * ( xAR - xAI );
421
422 pB = p + 2*k ;
423 pA += 2 ;
424
425 while (k > 0)
426 {
427 /* G is half of the frequency complex spectrum */
428 //for k = 2:N
429 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
430 xBI = pB[1] ;
431 xBR = pB[0] ;
432 xAR = pA[0];
433 xAI = pA[1];
434
435 twR = *pCoeff++;
436 twI = *pCoeff++;
437
438 t1a = xAR - xBR ;
439 t1b = xAI + xBI ;
440
441 r = twR * t1a;
442 s = twI * t1b;
443 t = twI * t1a;
444 u = twR * t1b;
445
446 // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
447 // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
448 *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
449 *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
450
451 pA += 2;
452 pB -= 2;
453 k--;
454 }
455
456 }
457
458 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
459
460 /**
461 @ingroup groupTransforms
462 */
463
464 /**
465 @defgroup RealFFT Real FFT Functions
466
467 @par
468 The CMSIS DSP library includes specialized algorithms for computing the
469 FFT of real data sequences. The FFT is defined over complex data but
470 in many applications the input is real. Real FFT algorithms take advantage
471 of the symmetry properties of the FFT and have a speed advantage over complex
472 algorithms of the same length.
473 @par
474 The Fast RFFT algorithm relays on the mixed radix CFFT that save processor usage.
475 @par
476 The real length N forward FFT of a sequence is computed using the steps shown below.
477 @par
478 \image html RFFT.gif "Real Fast Fourier Transform"
479 @par
480 The real sequence is initially treated as if it were complex to perform a CFFT.
481 Later, a processing stage reshapes the data to obtain half of the frequency spectrum
482 in complex format. Except the first complex number that contains the two real numbers
483 X[0] and X[N/2] all the data is complex. In other words, the first complex sample
484 contains two real values packed.
485 @par
486 The input for the inverse RFFT should keep the same format as the output of the
487 forward RFFT. A first processing stage pre-process the data to later perform an
488 inverse CFFT.
489 @par
490 \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
491 @par
492 The algorithms for floating-point, Q15, and Q31 data are slightly different
493 and we describe each algorithm in turn.
494 @par Floating-point
495 The main functions are \ref arm_rfft_fast_f32() and \ref arm_rfft_fast_init_f32().
496 The older functions \ref arm_rfft_f32() and \ref arm_rfft_init_f32() have been deprecated
497 but are still documented.
498 @par
499 The FFT of a real N-point sequence has even symmetry in the frequency domain.
500 The second half of the data equals the conjugate of the first half flipped in frequency.
501 Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers.
502 These are packed into the output array in alternating real and imaginary components:
503 @par
504 X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ...
505 real[(N/2)-1], imag[(N/2)-1 }
506 @par
507 It happens that the first complex number (real[0], imag[0]) is actually
508 all real. real[0] represents the DC offset, and imag[0] should be 0.
509 (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is
510 the first harmonic and so on.
511 @par
512 The real FFT functions pack the frequency domain data in this fashion.
513 The forward transform outputs the data in this form and the inverse
514 transform expects input data in this form. The function always performs
515 the needed bitreversal so that the input and output data is always in
516 normal order. The functions support lengths of [32, 64, 128, ..., 4096]
517 samples.
518 @par Q15 and Q31
519 The real algorithms are defined in a similar manner and utilize N/2 complex
520 transforms behind the scenes.
521 @par
522 The complex transforms used internally include scaling to prevent fixed-point
523 overflows. The overall scaling equals 1/(fftLen/2).
524 Due to the use of complex transform internally, the source buffer is
525 modified by the rfft.
526 @par
527 A separate instance structure must be defined for each transform used but
528 twiddle factor and bit reversal tables can be reused.
529 @par
530 There is also an associated initialization function for each data type.
531 The initialization function performs the following operations:
532 - Sets the values of the internal structure fields.
533 - Initializes twiddle factor table and bit reversal table pointers.
534 - Initializes the internal complex FFT data structure.
535 @par
536 Use of the initialization function is optional **except for MVE versions where it is mandatory**.
537 If you don't use the initialization functions, then the structures should be initialized with code
538 similar to the one below:
539 <pre>
540 arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
541 arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
542 </pre>
543 where <code>fftLenReal</code> is the length of the real transform;
544 <code>fftLenBy2</code> length of the internal complex transform (fftLenReal/2).
545 <code>ifftFlagR</code> Selects forward (=0) or inverse (=1) transform.
546 <code>bitReverseFlagR</code> Selects bit reversed output (=0) or normal order
547 output (=1).
548 <code>twidCoefRModifier</code> stride modifier for the twiddle factor table.
549 The value is based on the FFT length;
550 <code>pTwiddleAReal</code>points to the A array of twiddle coefficients;
551 <code>pTwiddleBReal</code>points to the B array of twiddle coefficients;
552 <code>pCfft</code> points to the CFFT Instance structure. The CFFT structure
553 must also be initialized.
554 @par
555 Note that with MVE versions you can't initialize instance structures directly and **must
556 use the initialization function**.
557 */
558
559 /**
560 @addtogroup RealFFT
561 @{
562 */
563
564 /**
565 @brief Processing function for the floating-point real FFT.
566 @param[in] S points to an arm_rfft_fast_instance_f32 structure
567 @param[in] p points to input buffer (Source buffer is modified by this function.)
568 @param[in] pOut points to output buffer
569 @param[in] ifftFlag
570 - value = 0: RFFT
571 - value = 1: RIFFT
572 @return none
573 */
574
arm_rfft_fast_f32(const arm_rfft_fast_instance_f32 * S,float32_t * p,float32_t * pOut,uint8_t ifftFlag)575 void arm_rfft_fast_f32(
576 const arm_rfft_fast_instance_f32 * S,
577 float32_t * p,
578 float32_t * pOut,
579 uint8_t ifftFlag)
580 {
581 const arm_cfft_instance_f32 * Sint = &(S->Sint);
582
583 /* Calculation of Real FFT */
584 if (ifftFlag)
585 {
586 /* Real FFT compression */
587 merge_rfft_f32(S, p, pOut);
588 /* Complex radix-4 IFFT process */
589 arm_cfft_f32( Sint, pOut, ifftFlag, 1);
590 }
591 else
592 {
593 /* Calculation of RFFT of input */
594 arm_cfft_f32( Sint, p, ifftFlag, 1);
595
596 /* Real FFT extraction */
597 stage_rfft_f32(S, p, pOut);
598 }
599 }
600
601 /**
602 * @} end of RealFFT group
603 */
604