1 /*Copyright (c) 2003-2004, Mark Borgerding
2 Lots of modifications by Jean-Marc Valin
3 Copyright (c) 2005-2007, Xiph.Org Foundation
4 Copyright (c) 2008, Xiph.Org Foundation, CSIRO
5
6 All rights reserved.
7
8 Redistribution and use in source and binary forms, with or without
9 modification, are permitted provided that the following conditions are met:
10
11 * Redistributions of source code must retain the above copyright notice,
12 this list of conditions and the following disclaimer.
13 * Redistributions in binary form must reproduce the above copyright notice,
14 this list of conditions and the following disclaimer in the
15 documentation and/or other materials provided with the distribution.
16
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.*/
28
29 /* This code is originally from Mark Borgerding's KISS-FFT but has been
30 heavily modified to better suit Opus */
31
32 #ifndef SKIP_CONFIG_H
33 # ifdef HAVE_CONFIG_H
34 # include "config.h"
35 # endif
36 #endif
37
38 #include "_kiss_fft_guts.h"
39 #include "arch.h"
40 #include "os_support.h"
41 #include "mathops.h"
42 #include "stack_alloc.h"
43
44 /* The guts header contains all the multiplication and addition macros that are defined for
45 complex numbers. It also delares the kf_ internal functions.
46 */
47
kf_bfly2(kiss_fft_cpx * Fout,int m,int N)48 static void kf_bfly2(
49 kiss_fft_cpx * Fout,
50 int m,
51 int N
52 )
53 {
54 kiss_fft_cpx * Fout2;
55 int i;
56 (void)m;
57 #ifdef CUSTOM_MODES
58 if (m==1)
59 {
60 celt_assert(m==1);
61 for (i=0;i<N;i++)
62 {
63 kiss_fft_cpx t;
64 Fout2 = Fout + 1;
65 t = *Fout2;
66 C_SUB( *Fout2 , *Fout , t );
67 C_ADDTO( *Fout , t );
68 Fout += 2;
69 }
70 } else
71 #endif
72 {
73 opus_val16 tw;
74 tw = QCONST16(0.7071067812f, 15);
75 /* We know that m==4 here because the radix-2 is just after a radix-4 */
76 celt_assert(m==4);
77 for (i=0;i<N;i++)
78 {
79 kiss_fft_cpx t;
80 Fout2 = Fout + 4;
81 t = Fout2[0];
82 C_SUB( Fout2[0] , Fout[0] , t );
83 C_ADDTO( Fout[0] , t );
84
85 t.r = S_MUL(ADD32_ovflw(Fout2[1].r, Fout2[1].i), tw);
86 t.i = S_MUL(SUB32_ovflw(Fout2[1].i, Fout2[1].r), tw);
87 C_SUB( Fout2[1] , Fout[1] , t );
88 C_ADDTO( Fout[1] , t );
89
90 t.r = Fout2[2].i;
91 t.i = -Fout2[2].r;
92 C_SUB( Fout2[2] , Fout[2] , t );
93 C_ADDTO( Fout[2] , t );
94
95 t.r = S_MUL(SUB32_ovflw(Fout2[3].i, Fout2[3].r), tw);
96 t.i = S_MUL(NEG32_ovflw(ADD32_ovflw(Fout2[3].i, Fout2[3].r)), tw);
97 C_SUB( Fout2[3] , Fout[3] , t );
98 C_ADDTO( Fout[3] , t );
99 Fout += 8;
100 }
101 }
102 }
103
kf_bfly4(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_state * st,int m,int N,int mm)104 static void kf_bfly4(
105 kiss_fft_cpx * Fout,
106 const size_t fstride,
107 const kiss_fft_state *st,
108 int m,
109 int N,
110 int mm
111 )
112 {
113 int i;
114
115 if (m==1)
116 {
117 /* Degenerate case where all the twiddles are 1. */
118 for (i=0;i<N;i++)
119 {
120 kiss_fft_cpx scratch0, scratch1;
121
122 C_SUB( scratch0 , *Fout, Fout[2] );
123 C_ADDTO(*Fout, Fout[2]);
124 C_ADD( scratch1 , Fout[1] , Fout[3] );
125 C_SUB( Fout[2], *Fout, scratch1 );
126 C_ADDTO( *Fout , scratch1 );
127 C_SUB( scratch1 , Fout[1] , Fout[3] );
128
129 Fout[1].r = ADD32_ovflw(scratch0.r, scratch1.i);
130 Fout[1].i = SUB32_ovflw(scratch0.i, scratch1.r);
131 Fout[3].r = SUB32_ovflw(scratch0.r, scratch1.i);
132 Fout[3].i = ADD32_ovflw(scratch0.i, scratch1.r);
133 Fout+=4;
134 }
135 } else {
136 int j;
137 kiss_fft_cpx scratch[6];
138 const kiss_twiddle_cpx *tw1,*tw2,*tw3;
139 const int m2=2*m;
140 const int m3=3*m;
141 kiss_fft_cpx * Fout_beg = Fout;
142 for (i=0;i<N;i++)
143 {
144 Fout = Fout_beg + i*mm;
145 tw3 = tw2 = tw1 = st->twiddles;
146 /* m is guaranteed to be a multiple of 4. */
147 for (j=0;j<m;j++)
148 {
149 C_MUL(scratch[0],Fout[m] , *tw1 );
150 C_MUL(scratch[1],Fout[m2] , *tw2 );
151 C_MUL(scratch[2],Fout[m3] , *tw3 );
152
153 C_SUB( scratch[5] , *Fout, scratch[1] );
154 C_ADDTO(*Fout, scratch[1]);
155 C_ADD( scratch[3] , scratch[0] , scratch[2] );
156 C_SUB( scratch[4] , scratch[0] , scratch[2] );
157 C_SUB( Fout[m2], *Fout, scratch[3] );
158 tw1 += fstride;
159 tw2 += fstride*2;
160 tw3 += fstride*3;
161 C_ADDTO( *Fout , scratch[3] );
162
163 Fout[m].r = ADD32_ovflw(scratch[5].r, scratch[4].i);
164 Fout[m].i = SUB32_ovflw(scratch[5].i, scratch[4].r);
165 Fout[m3].r = SUB32_ovflw(scratch[5].r, scratch[4].i);
166 Fout[m3].i = ADD32_ovflw(scratch[5].i, scratch[4].r);
167 ++Fout;
168 }
169 }
170 }
171 }
172
173
174 #ifndef RADIX_TWO_ONLY
175
kf_bfly3(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_state * st,int m,int N,int mm)176 static void kf_bfly3(
177 kiss_fft_cpx * Fout,
178 const size_t fstride,
179 const kiss_fft_state *st,
180 int m,
181 int N,
182 int mm
183 )
184 {
185 int i;
186 size_t k;
187 const size_t m2 = 2*m;
188 const kiss_twiddle_cpx *tw1,*tw2;
189 kiss_fft_cpx scratch[5];
190 kiss_twiddle_cpx epi3;
191
192 kiss_fft_cpx * Fout_beg = Fout;
193 #ifdef FIXED_POINT
194 /*epi3.r = -16384;*/ /* Unused */
195 epi3.i = -28378;
196 #else
197 epi3 = st->twiddles[fstride*m];
198 #endif
199 for (i=0;i<N;i++)
200 {
201 Fout = Fout_beg + i*mm;
202 tw1=tw2=st->twiddles;
203 /* For non-custom modes, m is guaranteed to be a multiple of 4. */
204 k=m;
205 do {
206
207 C_MUL(scratch[1],Fout[m] , *tw1);
208 C_MUL(scratch[2],Fout[m2] , *tw2);
209
210 C_ADD(scratch[3],scratch[1],scratch[2]);
211 C_SUB(scratch[0],scratch[1],scratch[2]);
212 tw1 += fstride;
213 tw2 += fstride*2;
214
215 Fout[m].r = SUB32_ovflw(Fout->r, HALF_OF(scratch[3].r));
216 Fout[m].i = SUB32_ovflw(Fout->i, HALF_OF(scratch[3].i));
217
218 C_MULBYSCALAR( scratch[0] , epi3.i );
219
220 C_ADDTO(*Fout,scratch[3]);
221
222 Fout[m2].r = ADD32_ovflw(Fout[m].r, scratch[0].i);
223 Fout[m2].i = SUB32_ovflw(Fout[m].i, scratch[0].r);
224
225 Fout[m].r = SUB32_ovflw(Fout[m].r, scratch[0].i);
226 Fout[m].i = ADD32_ovflw(Fout[m].i, scratch[0].r);
227
228 ++Fout;
229 } while(--k);
230 }
231 }
232
233
234 #ifndef OVERRIDE_kf_bfly5
kf_bfly5(kiss_fft_cpx * Fout,const size_t fstride,const kiss_fft_state * st,int m,int N,int mm)235 static void kf_bfly5(
236 kiss_fft_cpx * Fout,
237 const size_t fstride,
238 const kiss_fft_state *st,
239 int m,
240 int N,
241 int mm
242 )
243 {
244 kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
245 int i, u;
246 kiss_fft_cpx scratch[13];
247 const kiss_twiddle_cpx *tw;
248 kiss_twiddle_cpx ya,yb;
249 kiss_fft_cpx * Fout_beg = Fout;
250
251 #ifdef FIXED_POINT
252 ya.r = 10126;
253 ya.i = -31164;
254 yb.r = -26510;
255 yb.i = -19261;
256 #else
257 ya = st->twiddles[fstride*m];
258 yb = st->twiddles[fstride*2*m];
259 #endif
260 tw=st->twiddles;
261
262 for (i=0;i<N;i++)
263 {
264 Fout = Fout_beg + i*mm;
265 Fout0=Fout;
266 Fout1=Fout0+m;
267 Fout2=Fout0+2*m;
268 Fout3=Fout0+3*m;
269 Fout4=Fout0+4*m;
270
271 /* For non-custom modes, m is guaranteed to be a multiple of 4. */
272 for ( u=0; u<m; ++u ) {
273 scratch[0] = *Fout0;
274
275 C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
276 C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
277 C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
278 C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
279
280 C_ADD( scratch[7],scratch[1],scratch[4]);
281 C_SUB( scratch[10],scratch[1],scratch[4]);
282 C_ADD( scratch[8],scratch[2],scratch[3]);
283 C_SUB( scratch[9],scratch[2],scratch[3]);
284
285 Fout0->r = ADD32_ovflw(Fout0->r, ADD32_ovflw(scratch[7].r, scratch[8].r));
286 Fout0->i = ADD32_ovflw(Fout0->i, ADD32_ovflw(scratch[7].i, scratch[8].i));
287
288 scratch[5].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,ya.r), S_MUL(scratch[8].r,yb.r)));
289 scratch[5].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,ya.r), S_MUL(scratch[8].i,yb.r)));
290
291 scratch[6].r = ADD32_ovflw(S_MUL(scratch[10].i,ya.i), S_MUL(scratch[9].i,yb.i));
292 scratch[6].i = NEG32_ovflw(ADD32_ovflw(S_MUL(scratch[10].r,ya.i), S_MUL(scratch[9].r,yb.i)));
293
294 C_SUB(*Fout1,scratch[5],scratch[6]);
295 C_ADD(*Fout4,scratch[5],scratch[6]);
296
297 scratch[11].r = ADD32_ovflw(scratch[0].r, ADD32_ovflw(S_MUL(scratch[7].r,yb.r), S_MUL(scratch[8].r,ya.r)));
298 scratch[11].i = ADD32_ovflw(scratch[0].i, ADD32_ovflw(S_MUL(scratch[7].i,yb.r), S_MUL(scratch[8].i,ya.r)));
299 scratch[12].r = SUB32_ovflw(S_MUL(scratch[9].i,ya.i), S_MUL(scratch[10].i,yb.i));
300 scratch[12].i = SUB32_ovflw(S_MUL(scratch[10].r,yb.i), S_MUL(scratch[9].r,ya.i));
301
302 C_ADD(*Fout2,scratch[11],scratch[12]);
303 C_SUB(*Fout3,scratch[11],scratch[12]);
304
305 ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
306 }
307 }
308 }
309 #endif /* OVERRIDE_kf_bfly5 */
310
311
312 #endif
313
314
315 #ifdef CUSTOM_MODES
316
317 static
compute_bitrev_table(int Fout,opus_int16 * f,const size_t fstride,int in_stride,opus_int16 * factors,const kiss_fft_state * st)318 void compute_bitrev_table(
319 int Fout,
320 opus_int16 *f,
321 const size_t fstride,
322 int in_stride,
323 opus_int16 * factors,
324 const kiss_fft_state *st
325 )
326 {
327 const int p=*factors++; /* the radix */
328 const int m=*factors++; /* stage's fft length/p */
329
330 /*printf ("fft %d %d %d %d %d %d\n", p*m, m, p, s2, fstride*in_stride, N);*/
331 if (m==1)
332 {
333 int j;
334 for (j=0;j<p;j++)
335 {
336 *f = Fout+j;
337 f += fstride*in_stride;
338 }
339 } else {
340 int j;
341 for (j=0;j<p;j++)
342 {
343 compute_bitrev_table( Fout , f, fstride*p, in_stride, factors,st);
344 f += fstride*in_stride;
345 Fout += m;
346 }
347 }
348 }
349
350 /* facbuf is populated by p1,m1,p2,m2, ...
351 where
352 p[i] * m[i] = m[i-1]
353 m0 = n */
354 static
kf_factor(int n,opus_int16 * facbuf)355 int kf_factor(int n,opus_int16 * facbuf)
356 {
357 int p=4;
358 int i;
359 int stages=0;
360 int nbak = n;
361
362 /*factor out powers of 4, powers of 2, then any remaining primes */
363 do {
364 while (n % p) {
365 switch (p) {
366 case 4: p = 2; break;
367 case 2: p = 3; break;
368 default: p += 2; break;
369 }
370 if (p>32000 || (opus_int32)p*(opus_int32)p > n)
371 p = n; /* no more factors, skip to end */
372 }
373 n /= p;
374 #ifdef RADIX_TWO_ONLY
375 if (p!=2 && p != 4)
376 #else
377 if (p>5)
378 #endif
379 {
380 return 0;
381 }
382 facbuf[2*stages] = p;
383 if (p==2 && stages > 1)
384 {
385 facbuf[2*stages] = 4;
386 facbuf[2] = 2;
387 }
388 stages++;
389 } while (n > 1);
390 n = nbak;
391 /* Reverse the order to get the radix 4 at the end, so we can use the
392 fast degenerate case. It turns out that reversing the order also
393 improves the noise behaviour. */
394 for (i=0;i<stages/2;i++)
395 {
396 int tmp;
397 tmp = facbuf[2*i];
398 facbuf[2*i] = facbuf[2*(stages-i-1)];
399 facbuf[2*(stages-i-1)] = tmp;
400 }
401 for (i=0;i<stages;i++)
402 {
403 n /= facbuf[2*i];
404 facbuf[2*i+1] = n;
405 }
406 return 1;
407 }
408
compute_twiddles(kiss_twiddle_cpx * twiddles,int nfft)409 static void compute_twiddles(kiss_twiddle_cpx *twiddles, int nfft)
410 {
411 int i;
412 #ifdef FIXED_POINT
413 for (i=0;i<nfft;++i) {
414 opus_val32 phase = -i;
415 kf_cexp2(twiddles+i, DIV32(SHL32(phase,17),nfft));
416 }
417 #else
418 for (i=0;i<nfft;++i) {
419 const double pi=3.14159265358979323846264338327;
420 double phase = ( -2*pi /nfft ) * i;
421 kf_cexp(twiddles+i, phase );
422 }
423 #endif
424 }
425
opus_fft_alloc_arch_c(kiss_fft_state * st)426 int opus_fft_alloc_arch_c(kiss_fft_state *st) {
427 (void)st;
428 return 0;
429 }
430
431 /*
432 *
433 * Allocates all necessary storage space for the fft and ifft.
434 * The return value is a contiguous block of memory. As such,
435 * It can be freed with free().
436 * */
opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,const kiss_fft_state * base,int arch)437 kiss_fft_state *opus_fft_alloc_twiddles(int nfft,void * mem,size_t * lenmem,
438 const kiss_fft_state *base, int arch)
439 {
440 kiss_fft_state *st=NULL;
441 size_t memneeded = sizeof(struct kiss_fft_state); /* twiddle factors*/
442
443 if ( lenmem==NULL ) {
444 st = ( kiss_fft_state*)KISS_FFT_MALLOC( memneeded );
445 }else{
446 if (mem != NULL && *lenmem >= memneeded)
447 st = (kiss_fft_state*)mem;
448 *lenmem = memneeded;
449 }
450 if (st) {
451 opus_int16 *bitrev;
452 kiss_twiddle_cpx *twiddles;
453
454 st->nfft=nfft;
455 #ifdef FIXED_POINT
456 st->scale_shift = celt_ilog2(st->nfft);
457 if (st->nfft == 1<<st->scale_shift)
458 st->scale = Q15ONE;
459 else
460 st->scale = (1073741824+st->nfft/2)/st->nfft>>(15-st->scale_shift);
461 #else
462 st->scale = 1.f/nfft;
463 #endif
464 if (base != NULL)
465 {
466 st->twiddles = base->twiddles;
467 st->shift = 0;
468 while (st->shift < 32 && nfft<<st->shift != base->nfft)
469 st->shift++;
470 if (st->shift>=32)
471 goto fail;
472 } else {
473 st->twiddles = twiddles = (kiss_twiddle_cpx*)KISS_FFT_MALLOC(sizeof(kiss_twiddle_cpx)*nfft);
474 compute_twiddles(twiddles, nfft);
475 st->shift = -1;
476 }
477 if (!kf_factor(nfft,st->factors))
478 {
479 goto fail;
480 }
481
482 /* bitrev */
483 st->bitrev = bitrev = (opus_int16*)KISS_FFT_MALLOC(sizeof(opus_int16)*nfft);
484 if (st->bitrev==NULL)
485 goto fail;
486 compute_bitrev_table(0, bitrev, 1,1, st->factors,st);
487
488 /* Initialize architecture specific fft parameters */
489 if (opus_fft_alloc_arch(st, arch))
490 goto fail;
491 }
492 return st;
493 fail:
494 opus_fft_free(st, arch);
495 return NULL;
496 }
497
opus_fft_alloc(int nfft,void * mem,size_t * lenmem,int arch)498 kiss_fft_state *opus_fft_alloc(int nfft,void * mem,size_t * lenmem, int arch)
499 {
500 return opus_fft_alloc_twiddles(nfft, mem, lenmem, NULL, arch);
501 }
502
opus_fft_free_arch_c(kiss_fft_state * st)503 void opus_fft_free_arch_c(kiss_fft_state *st) {
504 (void)st;
505 }
506
opus_fft_free(const kiss_fft_state * cfg,int arch)507 void opus_fft_free(const kiss_fft_state *cfg, int arch)
508 {
509 if (cfg)
510 {
511 opus_fft_free_arch((kiss_fft_state *)cfg, arch);
512 opus_free((opus_int16*)cfg->bitrev);
513 if (cfg->shift < 0)
514 opus_free((kiss_twiddle_cpx*)cfg->twiddles);
515 opus_free((kiss_fft_state*)cfg);
516 }
517 }
518
519 #endif /* CUSTOM_MODES */
520
opus_fft_impl(const kiss_fft_state * st,kiss_fft_cpx * fout)521 void opus_fft_impl(const kiss_fft_state *st,kiss_fft_cpx *fout)
522 {
523 int m2, m;
524 int p;
525 int L;
526 int fstride[MAXFACTORS];
527 int i;
528 int shift;
529
530 /* st->shift can be -1 */
531 shift = st->shift>0 ? st->shift : 0;
532
533 fstride[0] = 1;
534 L=0;
535 do {
536 p = st->factors[2*L];
537 m = st->factors[2*L+1];
538 fstride[L+1] = fstride[L]*p;
539 L++;
540 } while(m!=1);
541 m = st->factors[2*L-1];
542 for (i=L-1;i>=0;i--)
543 {
544 if (i!=0)
545 m2 = st->factors[2*i-1];
546 else
547 m2 = 1;
548 switch (st->factors[2*i])
549 {
550 case 2:
551 kf_bfly2(fout, m, fstride[i]);
552 break;
553 case 4:
554 kf_bfly4(fout,fstride[i]<<shift,st,m, fstride[i], m2);
555 break;
556 #ifndef RADIX_TWO_ONLY
557 case 3:
558 kf_bfly3(fout,fstride[i]<<shift,st,m, fstride[i], m2);
559 break;
560 case 5:
561 kf_bfly5(fout,fstride[i]<<shift,st,m, fstride[i], m2);
562 break;
563 #endif
564 }
565 m = m2;
566 }
567 }
568
opus_fft_c(const kiss_fft_state * st,const kiss_fft_cpx * fin,kiss_fft_cpx * fout)569 void opus_fft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
570 {
571 int i;
572 opus_val16 scale;
573 #ifdef FIXED_POINT
574 /* Allows us to scale with MULT16_32_Q16(), which is faster than
575 MULT16_32_Q15() on ARM. */
576 int scale_shift = st->scale_shift-1;
577 #endif
578 scale = st->scale;
579
580 celt_assert2 (fin != fout, "In-place FFT not supported");
581 /* Bit-reverse the input */
582 for (i=0;i<st->nfft;i++)
583 {
584 kiss_fft_cpx x = fin[i];
585 fout[st->bitrev[i]].r = SHR32(MULT16_32_Q16(scale, x.r), scale_shift);
586 fout[st->bitrev[i]].i = SHR32(MULT16_32_Q16(scale, x.i), scale_shift);
587 }
588 opus_fft_impl(st, fout);
589 }
590
591
opus_ifft_c(const kiss_fft_state * st,const kiss_fft_cpx * fin,kiss_fft_cpx * fout)592 void opus_ifft_c(const kiss_fft_state *st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
593 {
594 int i;
595 celt_assert2 (fin != fout, "In-place FFT not supported");
596 /* Bit-reverse the input */
597 for (i=0;i<st->nfft;i++)
598 fout[st->bitrev[i]] = fin[i];
599 for (i=0;i<st->nfft;i++)
600 fout[i].i = -fout[i].i;
601 opus_fft_impl(st, fout);
602 for (i=0;i<st->nfft;i++)
603 fout[i].i = -fout[i].i;
604 }
605