• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ** FFT and FHT routines
3 **  Copyright 1988, 1993; Ron Mayer
4 **      Copyright (c) 1999-2000 Takehiro Tominaga
5 **
6 **  fht(fz,n);
7 **      Does a hartley transform of "n" points in the array "fz".
8 **
9 ** NOTE: This routine uses at least 2 patented algorithms, and may be
10 **       under the restrictions of a bunch of different organizations.
11 **       Although I wrote it completely myself; it is kind of a derivative
12 **       of a routine I once authored and released under the GPL, so it
13 **       may fall under the free software foundation's restrictions;
14 **       it was worked on as a Stanford Univ project, so they claim
15 **       some rights to it; it was further optimized at work here, so
16 **       I think this company claims parts of it.  The patents are
17 **       held by R. Bracewell (the FHT algorithm) and O. Buneman (the
18 **       trig generator), both at Stanford Univ.
19 **       If it were up to me, I'd say go do whatever you want with it;
20 **       but it would be polite to give credit to the following people
21 **       if you use this anywhere:
22 **           Euler     - probable inventor of the fourier transform.
23 **           Gauss     - probable inventor of the FFT.
24 **           Hartley   - probable inventor of the hartley transform.
25 **           Buneman   - for a really cool trig generator
26 **           Mayer(me) - for authoring this particular version and
27 **                       including all the optimizations in one package.
28 **       Thanks,
29 **       Ron Mayer; mayer@acuson.com
30 ** and added some optimization by
31 **           Mather    - idea of using lookup table
32 **           Takehiro  - some dirty hack for speed up
33 */
34 
35 /* $Id$ */
36 
37 #ifdef HAVE_CONFIG_H
38 # include <config.h>
39 #endif
40 
41 #include "lame.h"
42 #include "machine.h"
43 #include "encoder.h"
44 #include "util.h"
45 #include "fft.h"
46 
47 #include "vector/lame_intrin.h"
48 
49 
50 
51 #define TRI_SIZE (5-1)  /* 1024 =  4**5 */
52 
53 /* fft.c    */
54 
55 static const FLOAT costab[TRI_SIZE * 2] = {
56     9.238795325112867e-01, 3.826834323650898e-01,
57     9.951847266721969e-01, 9.801714032956060e-02,
58     9.996988186962042e-01, 2.454122852291229e-02,
59     9.999811752826011e-01, 6.135884649154475e-03
60 };
61 
62 static void
fht(FLOAT * fz,int n)63 fht(FLOAT * fz, int n)
64 {
65     const FLOAT *tri = costab;
66     int     k4;
67     FLOAT  *fi, *gi;
68     FLOAT const *fn;
69 
70     n <<= 1;            /* to get BLKSIZE, because of 3DNow! ASM routine */
71     fn = fz + n;
72     k4 = 4;
73     do {
74         FLOAT   s1, c1;
75         int     i, k1, k2, k3, kx;
76         kx = k4 >> 1;
77         k1 = k4;
78         k2 = k4 << 1;
79         k3 = k2 + k1;
80         k4 = k2 << 1;
81         fi = fz;
82         gi = fi + kx;
83         do {
84             FLOAT   f0, f1, f2, f3;
85             f1 = fi[0] - fi[k1];
86             f0 = fi[0] + fi[k1];
87             f3 = fi[k2] - fi[k3];
88             f2 = fi[k2] + fi[k3];
89             fi[k2] = f0 - f2;
90             fi[0] = f0 + f2;
91             fi[k3] = f1 - f3;
92             fi[k1] = f1 + f3;
93             f1 = gi[0] - gi[k1];
94             f0 = gi[0] + gi[k1];
95             f3 = SQRT2 * gi[k3];
96             f2 = SQRT2 * gi[k2];
97             gi[k2] = f0 - f2;
98             gi[0] = f0 + f2;
99             gi[k3] = f1 - f3;
100             gi[k1] = f1 + f3;
101             gi += k4;
102             fi += k4;
103         } while (fi < fn);
104         c1 = tri[0];
105         s1 = tri[1];
106         for (i = 1; i < kx; i++) {
107             FLOAT   c2, s2;
108             c2 = 1 - (2 * s1) * s1;
109             s2 = (2 * s1) * c1;
110             fi = fz + i;
111             gi = fz + k1 - i;
112             do {
113                 FLOAT   a, b, g0, f0, f1, g1, f2, g2, f3, g3;
114                 b = s2 * fi[k1] - c2 * gi[k1];
115                 a = c2 * fi[k1] + s2 * gi[k1];
116                 f1 = fi[0] - a;
117                 f0 = fi[0] + a;
118                 g1 = gi[0] - b;
119                 g0 = gi[0] + b;
120                 b = s2 * fi[k3] - c2 * gi[k3];
121                 a = c2 * fi[k3] + s2 * gi[k3];
122                 f3 = fi[k2] - a;
123                 f2 = fi[k2] + a;
124                 g3 = gi[k2] - b;
125                 g2 = gi[k2] + b;
126                 b = s1 * f2 - c1 * g3;
127                 a = c1 * f2 + s1 * g3;
128                 fi[k2] = f0 - a;
129                 fi[0] = f0 + a;
130                 gi[k3] = g1 - b;
131                 gi[k1] = g1 + b;
132                 b = c1 * g2 - s1 * f3;
133                 a = s1 * g2 + c1 * f3;
134                 gi[k2] = g0 - a;
135                 gi[0] = g0 + a;
136                 fi[k3] = f1 - b;
137                 fi[k1] = f1 + b;
138                 gi += k4;
139                 fi += k4;
140             } while (fi < fn);
141             c2 = c1;
142             c1 = c2 * tri[0] - s1 * tri[1];
143             s1 = c2 * tri[1] + s1 * tri[0];
144         }
145         tri += 2;
146     } while (k4 < n);
147 }
148 
149 
150 static const unsigned char rv_tbl[] = {
151     0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
152     0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
153     0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
154     0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
155     0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
156     0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
157     0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
158     0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
159     0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
160     0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
161     0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
162     0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
163     0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
164     0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
165     0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
166     0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
167 };
168 
169 #define ch01(index)  (buffer[chn][index])
170 
171 #define ml00(f) (window[i        ] * f(i))
172 #define ml10(f) (window[i + 0x200] * f(i + 0x200))
173 #define ml20(f) (window[i + 0x100] * f(i + 0x100))
174 #define ml30(f) (window[i + 0x300] * f(i + 0x300))
175 
176 #define ml01(f) (window[i + 0x001] * f(i + 0x001))
177 #define ml11(f) (window[i + 0x201] * f(i + 0x201))
178 #define ml21(f) (window[i + 0x101] * f(i + 0x101))
179 #define ml31(f) (window[i + 0x301] * f(i + 0x301))
180 
181 #define ms00(f) (window_s[i       ] * f(i + k))
182 #define ms10(f) (window_s[0x7f - i] * f(i + k + 0x80))
183 #define ms20(f) (window_s[i + 0x40] * f(i + k + 0x40))
184 #define ms30(f) (window_s[0x3f - i] * f(i + k + 0xc0))
185 
186 #define ms01(f) (window_s[i + 0x01] * f(i + k + 0x01))
187 #define ms11(f) (window_s[0x7e - i] * f(i + k + 0x81))
188 #define ms21(f) (window_s[i + 0x41] * f(i + k + 0x41))
189 #define ms31(f) (window_s[0x3e - i] * f(i + k + 0xc1))
190 
191 void
fft_short(lame_internal_flags const * const gfc,FLOAT x_real[3][BLKSIZE_s],int chn,const sample_t * const buffer[2])192 fft_short(lame_internal_flags const *const gfc,
193           FLOAT x_real[3][BLKSIZE_s], int chn, const sample_t *const buffer[2])
194 {
195     int     i;
196     int     j;
197     int     b;
198 
199 #define window_s gfc->cd_psy->window_s
200 #define window gfc->cd_psy->window
201 
202     for (b = 0; b < 3; b++) {
203         FLOAT  *x = &x_real[b][BLKSIZE_s / 2];
204         short const k = (576 / 3) * (b + 1);
205         j = BLKSIZE_s / 8 - 1;
206         do {
207             FLOAT   f0, f1, f2, f3, w;
208 
209             i = rv_tbl[j << 2];
210 
211             f0 = ms00(ch01);
212             w = ms10(ch01);
213             f1 = f0 - w;
214             f0 = f0 + w;
215             f2 = ms20(ch01);
216             w = ms30(ch01);
217             f3 = f2 - w;
218             f2 = f2 + w;
219 
220             x -= 4;
221             x[0] = f0 + f2;
222             x[2] = f0 - f2;
223             x[1] = f1 + f3;
224             x[3] = f1 - f3;
225 
226             f0 = ms01(ch01);
227             w = ms11(ch01);
228             f1 = f0 - w;
229             f0 = f0 + w;
230             f2 = ms21(ch01);
231             w = ms31(ch01);
232             f3 = f2 - w;
233             f2 = f2 + w;
234 
235             x[BLKSIZE_s / 2 + 0] = f0 + f2;
236             x[BLKSIZE_s / 2 + 2] = f0 - f2;
237             x[BLKSIZE_s / 2 + 1] = f1 + f3;
238             x[BLKSIZE_s / 2 + 3] = f1 - f3;
239         } while (--j >= 0);
240 
241 #undef window
242 #undef window_s
243 
244         gfc->fft_fht(x, BLKSIZE_s / 2);
245         /* BLKSIZE_s/2 because of 3DNow! ASM routine */
246     }
247 }
248 
249 void
fft_long(lame_internal_flags const * const gfc,FLOAT x[BLKSIZE],int chn,const sample_t * const buffer[2])250 fft_long(lame_internal_flags const *const gfc,
251          FLOAT x[BLKSIZE], int chn, const sample_t *const buffer[2])
252 {
253     int     i;
254     int     jj = BLKSIZE / 8 - 1;
255     x += BLKSIZE / 2;
256 
257 #define window_s gfc->cd_psy->window_s
258 #define window gfc->cd_psy->window
259 
260     do {
261         FLOAT   f0, f1, f2, f3, w;
262 
263         i = rv_tbl[jj];
264         f0 = ml00(ch01);
265         w = ml10(ch01);
266         f1 = f0 - w;
267         f0 = f0 + w;
268         f2 = ml20(ch01);
269         w = ml30(ch01);
270         f3 = f2 - w;
271         f2 = f2 + w;
272 
273         x -= 4;
274         x[0] = f0 + f2;
275         x[2] = f0 - f2;
276         x[1] = f1 + f3;
277         x[3] = f1 - f3;
278 
279         f0 = ml01(ch01);
280         w = ml11(ch01);
281         f1 = f0 - w;
282         f0 = f0 + w;
283         f2 = ml21(ch01);
284         w = ml31(ch01);
285         f3 = f2 - w;
286         f2 = f2 + w;
287 
288         x[BLKSIZE / 2 + 0] = f0 + f2;
289         x[BLKSIZE / 2 + 2] = f0 - f2;
290         x[BLKSIZE / 2 + 1] = f1 + f3;
291         x[BLKSIZE / 2 + 3] = f1 - f3;
292     } while (--jj >= 0);
293 
294 #undef window
295 #undef window_s
296 
297     gfc->fft_fht(x, BLKSIZE / 2);
298     /* BLKSIZE/2 because of 3DNow! ASM routine */
299 }
300 
301 #ifdef HAVE_NASM
302 extern void fht_3DN(FLOAT * fz, int n);
303 extern void fht_SSE(FLOAT * fz, int n);
304 #endif
305 
306 void
init_fft(lame_internal_flags * const gfc)307 init_fft(lame_internal_flags * const gfc)
308 {
309     int     i;
310 
311     /* The type of window used here will make no real difference, but */
312     /* in the interest of merging nspsytune stuff - switch to blackman window */
313     for (i = 0; i < BLKSIZE; i++)
314         /* blackman window */
315         gfc->cd_psy->window[i] = 0.42 - 0.5 * cos(2 * PI * (i + .5) / BLKSIZE) +
316             0.08 * cos(4 * PI * (i + .5) / BLKSIZE);
317 
318     for (i = 0; i < BLKSIZE_s / 2; i++)
319         gfc->cd_psy->window_s[i] = 0.5 * (1.0 - cos(2.0 * PI * (i + 0.5) / BLKSIZE_s));
320 
321     gfc->fft_fht = fht;
322 #ifdef HAVE_NASM
323     if (gfc->CPU_features.AMD_3DNow) {
324         gfc->fft_fht = fht_3DN;
325     }
326     else if (gfc->CPU_features.SSE) {
327         gfc->fft_fht = fht_SSE;
328     }
329     else {
330         gfc->fft_fht = fht;
331     }
332 #else
333 #ifdef HAVE_XMMINTRIN_H
334 #ifdef MIN_ARCH_SSE
335     gfc->fft_fht = fht_SSE2;
336 #endif
337 #endif
338 #endif
339 }
340