1 /*
2 * Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
3 * This file is part of KISS FFT - https://github.com/mborgerding/kissfft
4 *
5 * SPDX-License-Identifier: BSD-3-Clause
6 * See COPYING file for more information.
7 */
8
9
10 #include "_kiss_fft_guts_f32.h"
11 /* The guts header contains all the multiplication and addition macros that are defined for
12 fixed or floating point complex numbers. It also delares the kf_ internal functions.
13 */
14
15 static void
kf_bfly2(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,int m)16 kf_bfly2 (kiss_fft_f32_cpx * Fout,
17 const size_t fstride, const kiss_fft_f32_cfg st, int m)
18 {
19 kiss_fft_f32_cpx *Fout2;
20 kiss_fft_f32_cpx *tw1 = st->twiddles;
21 kiss_fft_f32_cpx t;
22 Fout2 = Fout + m;
23 do {
24 C_FIXDIV (*Fout, 2);
25 C_FIXDIV (*Fout2, 2);
26
27 C_MUL (t, *Fout2, *tw1);
28 tw1 += fstride;
29 C_SUB (*Fout2, *Fout, t);
30 C_ADDTO (*Fout, t);
31 ++Fout2;
32 ++Fout;
33 } while (--m);
34 }
35
36 static void
kf_bfly4(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,const size_t m)37 kf_bfly4 (kiss_fft_f32_cpx * Fout,
38 const size_t fstride, const kiss_fft_f32_cfg st, const size_t m)
39 {
40 kiss_fft_f32_cpx *tw1, *tw2, *tw3;
41 kiss_fft_f32_cpx scratch[6];
42 size_t k = m;
43 const size_t m2 = 2 * m;
44 const size_t m3 = 3 * m;
45
46
47 tw3 = tw2 = tw1 = st->twiddles;
48
49 do {
50 C_FIXDIV (*Fout, 4);
51 C_FIXDIV (Fout[m], 4);
52 C_FIXDIV (Fout[m2], 4);
53 C_FIXDIV (Fout[m3], 4);
54
55 C_MUL (scratch[0], Fout[m], *tw1);
56 C_MUL (scratch[1], Fout[m2], *tw2);
57 C_MUL (scratch[2], Fout[m3], *tw3);
58
59 C_SUB (scratch[5], *Fout, scratch[1]);
60 C_ADDTO (*Fout, scratch[1]);
61 C_ADD (scratch[3], scratch[0], scratch[2]);
62 C_SUB (scratch[4], scratch[0], scratch[2]);
63 C_SUB (Fout[m2], *Fout, scratch[3]);
64 tw1 += fstride;
65 tw2 += fstride * 2;
66 tw3 += fstride * 3;
67 C_ADDTO (*Fout, scratch[3]);
68
69 if (st->inverse) {
70 Fout[m].r = scratch[5].r - scratch[4].i;
71 Fout[m].i = scratch[5].i + scratch[4].r;
72 Fout[m3].r = scratch[5].r + scratch[4].i;
73 Fout[m3].i = scratch[5].i - scratch[4].r;
74 } else {
75 Fout[m].r = scratch[5].r + scratch[4].i;
76 Fout[m].i = scratch[5].i - scratch[4].r;
77 Fout[m3].r = scratch[5].r - scratch[4].i;
78 Fout[m3].i = scratch[5].i + scratch[4].r;
79 }
80 ++Fout;
81 } while (--k);
82 }
83
84 static void
kf_bfly3(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,size_t m)85 kf_bfly3 (kiss_fft_f32_cpx * Fout,
86 const size_t fstride, const kiss_fft_f32_cfg st, size_t m)
87 {
88 size_t k = m;
89 const size_t m2 = 2 * m;
90 kiss_fft_f32_cpx *tw1, *tw2;
91 kiss_fft_f32_cpx scratch[5];
92 kiss_fft_f32_cpx epi3;
93 epi3 = st->twiddles[fstride * m];
94
95 tw1 = tw2 = st->twiddles;
96
97 do {
98 C_FIXDIV (*Fout, 3);
99 C_FIXDIV (Fout[m], 3);
100 C_FIXDIV (Fout[m2], 3);
101
102 C_MUL (scratch[1], Fout[m], *tw1);
103 C_MUL (scratch[2], Fout[m2], *tw2);
104
105 C_ADD (scratch[3], scratch[1], scratch[2]);
106 C_SUB (scratch[0], scratch[1], scratch[2]);
107 tw1 += fstride;
108 tw2 += fstride * 2;
109
110 Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
111 Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
112
113 C_MULBYSCALAR (scratch[0], epi3.i);
114
115 C_ADDTO (*Fout, scratch[3]);
116
117 Fout[m2].r = Fout[m].r + scratch[0].i;
118 Fout[m2].i = Fout[m].i - scratch[0].r;
119
120 Fout[m].r -= scratch[0].i;
121 Fout[m].i += scratch[0].r;
122
123 ++Fout;
124 } while (--k);
125 }
126
127 static void
kf_bfly5(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,int m)128 kf_bfly5 (kiss_fft_f32_cpx * Fout,
129 const size_t fstride, const kiss_fft_f32_cfg st, int m)
130 {
131 kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
132 int u;
133 kiss_fft_f32_cpx scratch[13];
134 kiss_fft_f32_cpx *twiddles = st->twiddles;
135 kiss_fft_f32_cpx *tw;
136 kiss_fft_f32_cpx ya, yb;
137 ya = twiddles[fstride * m];
138 yb = twiddles[fstride * 2 * m];
139
140 Fout0 = Fout;
141 Fout1 = Fout0 + m;
142 Fout2 = Fout0 + 2 * m;
143 Fout3 = Fout0 + 3 * m;
144 Fout4 = Fout0 + 4 * m;
145
146 tw = st->twiddles;
147 for (u = 0; u < m; ++u) {
148 C_FIXDIV (*Fout0, 5);
149 C_FIXDIV (*Fout1, 5);
150 C_FIXDIV (*Fout2, 5);
151 C_FIXDIV (*Fout3, 5);
152 C_FIXDIV (*Fout4, 5);
153 scratch[0] = *Fout0;
154
155 C_MUL (scratch[1], *Fout1, tw[u * fstride]);
156 C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
157 C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
158 C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
159
160 C_ADD (scratch[7], scratch[1], scratch[4]);
161 C_SUB (scratch[10], scratch[1], scratch[4]);
162 C_ADD (scratch[8], scratch[2], scratch[3]);
163 C_SUB (scratch[9], scratch[2], scratch[3]);
164
165 Fout0->r += scratch[7].r + scratch[8].r;
166 Fout0->i += scratch[7].i + scratch[8].i;
167
168 scratch[5].r =
169 scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
170 scratch[5].i =
171 scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
172
173 scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
174 scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
175
176 C_SUB (*Fout1, scratch[5], scratch[6]);
177 C_ADD (*Fout4, scratch[5], scratch[6]);
178
179 scratch[11].r =
180 scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
181 scratch[11].i =
182 scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
183 scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
184 scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
185
186 C_ADD (*Fout2, scratch[11], scratch[12]);
187 C_SUB (*Fout3, scratch[11], scratch[12]);
188
189 ++Fout0;
190 ++Fout1;
191 ++Fout2;
192 ++Fout3;
193 ++Fout4;
194 }
195 }
196
197 /* perform the butterfly for one stage of a mixed radix FFT */
198 static void
kf_bfly_generic(kiss_fft_f32_cpx * Fout,const size_t fstride,const kiss_fft_f32_cfg st,int m,int p)199 kf_bfly_generic (kiss_fft_f32_cpx * Fout,
200 const size_t fstride, const kiss_fft_f32_cfg st, int m, int p)
201 {
202 int u, k, q1, q;
203 kiss_fft_f32_cpx *twiddles = st->twiddles;
204 kiss_fft_f32_cpx t;
205 int Norig = st->nfft;
206
207 kiss_fft_f32_cpx *scratch =
208 (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
209 p);
210
211 for (u = 0; u < m; ++u) {
212 k = u;
213 for (q1 = 0; q1 < p; ++q1) {
214 scratch[q1] = Fout[k];
215 C_FIXDIV (scratch[q1], p);
216 k += m;
217 }
218
219 k = u;
220 for (q1 = 0; q1 < p; ++q1) {
221 int twidx = 0;
222 Fout[k] = scratch[0];
223 for (q = 1; q < p; ++q) {
224 twidx += fstride * k;
225 if (twidx >= Norig)
226 twidx -= Norig;
227 C_MUL (t, scratch[q], twiddles[twidx]);
228 C_ADDTO (Fout[k], t);
229 }
230 k += m;
231 }
232 }
233 KISS_FFT_F32_TMP_FREE (scratch);
234 }
235
236 static void
kf_work(kiss_fft_f32_cpx * Fout,const kiss_fft_f32_cpx * f,const size_t fstride,int in_stride,int * factors,const kiss_fft_f32_cfg st)237 kf_work (kiss_fft_f32_cpx * Fout,
238 const kiss_fft_f32_cpx * f,
239 const size_t fstride, int in_stride, int *factors,
240 const kiss_fft_f32_cfg st)
241 {
242 kiss_fft_f32_cpx *Fout_beg = Fout;
243 const int p = *factors++; /* the radix */
244 const int m = *factors++; /* stage's fft length/p */
245 const kiss_fft_f32_cpx *Fout_end = Fout + p * m;
246
247 #ifdef _OPENMP
248 // use openmp extensions at the
249 // top-level (not recursive)
250 if (fstride == 1 && p <= 5 && m != 1) {
251 int k;
252
253 // execute the p different work units in different threads
254 # pragma omp parallel for
255 for (k = 0; k < p; ++k)
256 kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
257 in_stride, factors, st);
258 // all threads have joined by this point
259
260 switch (p) {
261 case 2:
262 kf_bfly2 (Fout, fstride, st, m);
263 break;
264 case 3:
265 kf_bfly3 (Fout, fstride, st, m);
266 break;
267 case 4:
268 kf_bfly4 (Fout, fstride, st, m);
269 break;
270 case 5:
271 kf_bfly5 (Fout, fstride, st, m);
272 break;
273 default:
274 kf_bfly_generic (Fout, fstride, st, m, p);
275 break;
276 }
277 return;
278 }
279 #endif
280
281 if (m == 1) {
282 do {
283 *Fout = *f;
284 f += fstride * in_stride;
285 } while (++Fout != Fout_end);
286 } else {
287 do {
288 // recursive call:
289 // DFT of size m*p performed by doing
290 // p instances of smaller DFTs of size m,
291 // each one takes a decimated version of the input
292 kf_work (Fout, f, fstride * p, in_stride, factors, st);
293 f += fstride * in_stride;
294 } while ((Fout += m) != Fout_end);
295 }
296
297 Fout = Fout_beg;
298
299 // recombine the p smaller DFTs
300 switch (p) {
301 case 2:
302 kf_bfly2 (Fout, fstride, st, m);
303 break;
304 case 3:
305 kf_bfly3 (Fout, fstride, st, m);
306 break;
307 case 4:
308 kf_bfly4 (Fout, fstride, st, m);
309 break;
310 case 5:
311 kf_bfly5 (Fout, fstride, st, m);
312 break;
313 default:
314 kf_bfly_generic (Fout, fstride, st, m, p);
315 break;
316 }
317 }
318
319 /* facbuf is populated by p1,m1,p2,m2, ...
320 where
321 p[i] * m[i] = m[i-1]
322 m0 = n */
323 static void
kf_factor(int n,int * facbuf)324 kf_factor (int n, int *facbuf)
325 {
326 int p = 4;
327 double floor_sqrt;
328 floor_sqrt = floor (sqrt ((double) n));
329
330 /*factor out powers of 4, powers of 2, then any remaining primes */
331 do {
332 while (n % p) {
333 switch (p) {
334 case 4:
335 p = 2;
336 break;
337 case 2:
338 p = 3;
339 break;
340 default:
341 p += 2;
342 break;
343 }
344 if (p > floor_sqrt)
345 p = n; /* no more factors, skip to end */
346 }
347 n /= p;
348 *facbuf++ = p;
349 *facbuf++ = n;
350 } while (n > 1);
351 }
352
353 /*
354 *
355 * User-callable function to allocate all necessary storage space for the fft.
356 *
357 * The return value is a contiguous block of memory, allocated with malloc. As such,
358 * It can be freed with free(), rather than a kiss_fft_f32-specific function.
359 * */
360 kiss_fft_f32_cfg
kiss_fft_f32_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)361 kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
362 {
363 kiss_fft_f32_cfg st = NULL;
364 size_t memneeded = sizeof (struct kiss_fft_f32_state)
365 + sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */
366
367 if (lenmem == NULL) {
368 st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
369 } else {
370 if (mem != NULL && *lenmem >= memneeded)
371 st = (kiss_fft_f32_cfg) mem;
372 *lenmem = memneeded;
373 }
374 if (st) {
375 int i;
376 st->nfft = nfft;
377 st->inverse = inverse_fft;
378
379 for (i = 0; i < nfft; ++i) {
380 const double pi =
381 3.141592653589793238462643383279502884197169399375105820974944;
382 double phase = -2 * pi * i / nfft;
383 if (st->inverse)
384 phase *= -1;
385 kf_cexp (st->twiddles + i, phase);
386 }
387
388 kf_factor (nfft, st->factors);
389 }
390 return st;
391 }
392
393
394 void
kiss_fft_f32_stride(kiss_fft_f32_cfg st,const kiss_fft_f32_cpx * fin,kiss_fft_f32_cpx * fout,int in_stride)395 kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin,
396 kiss_fft_f32_cpx * fout, int in_stride)
397 {
398 if (fin == fout) {
399 //NOTE: this is not really an in-place FFT algorithm.
400 //It just performs an out-of-place FFT into a temp buffer
401 kiss_fft_f32_cpx *tmpbuf =
402 (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
403 st->nfft);
404 kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
405 memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft);
406 KISS_FFT_F32_TMP_FREE (tmpbuf);
407 } else {
408 kf_work (fout, fin, 1, in_stride, st->factors, st);
409 }
410 }
411
412 void
kiss_fft_f32(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx * fin,kiss_fft_f32_cpx * fout)413 kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin,
414 kiss_fft_f32_cpx * fout)
415 {
416 kiss_fft_f32_stride (cfg, fin, fout, 1);
417 }
418
419
420 void
kiss_fft_f32_cleanup(void)421 kiss_fft_f32_cleanup (void)
422 {
423 // nothing needed any more
424 }
425
426 int
kiss_fft_f32_next_fast_size(int n)427 kiss_fft_f32_next_fast_size (int n)
428 {
429 while (1) {
430 int m = n;
431 while ((m % 2) == 0)
432 m /= 2;
433 while ((m % 3) == 0)
434 m /= 3;
435 while ((m % 5) == 0)
436 m /= 5;
437 if (m <= 1)
438 break; /* n is completely factorable by twos, threes, and fives */
439 n++;
440 }
441 return n;
442 }
443