1 /*
2 ** FFT and FHT routines
3 ** Copyright 1988, 1993; Ron Mayer
4 ** Copyright (c) 1999-2000 Takehiro Tominaga
5 **
6 ** fht(fz,n);
7 ** Does a hartley transform of "n" points in the array "fz".
8 **
9 ** NOTE: This routine uses at least 2 patented algorithms, and may be
10 ** under the restrictions of a bunch of different organizations.
11 ** Although I wrote it completely myself; it is kind of a derivative
12 ** of a routine I once authored and released under the GPL, so it
13 ** may fall under the free software foundation's restrictions;
14 ** it was worked on as a Stanford Univ project, so they claim
15 ** some rights to it; it was further optimized at work here, so
16 ** I think this company claims parts of it. The patents are
17 ** held by R. Bracewell (the FHT algorithm) and O. Buneman (the
18 ** trig generator), both at Stanford Univ.
19 ** If it were up to me, I'd say go do whatever you want with it;
20 ** but it would be polite to give credit to the following people
21 ** if you use this anywhere:
22 ** Euler - probable inventor of the fourier transform.
23 ** Gauss - probable inventor of the FFT.
24 ** Hartley - probable inventor of the hartley transform.
25 ** Buneman - for a really cool trig generator
26 ** Mayer(me) - for authoring this particular version and
27 ** including all the optimizations in one package.
28 ** Thanks,
29 ** Ron Mayer; mayer@acuson.com
30 ** and added some optimization by
31 ** Mather - idea of using lookup table
32 ** Takehiro - some dirty hack for speed up
33 */
34
35 /* $Id$ */
36
37 #ifdef HAVE_CONFIG_H
38 # include <config.h>
39 #endif
40
41 #include "lame.h"
42 #include "machine.h"
43 #include "encoder.h"
44 #include "util.h"
45 #include "fft.h"
46
47 #include "vector/lame_intrin.h"
48
49
50
51 #define TRI_SIZE (5-1) /* 1024 = 4**5 */
52
53 /* fft.c */
54
55 static const FLOAT costab[TRI_SIZE * 2] = {
56 9.238795325112867e-01, 3.826834323650898e-01,
57 9.951847266721969e-01, 9.801714032956060e-02,
58 9.996988186962042e-01, 2.454122852291229e-02,
59 9.999811752826011e-01, 6.135884649154475e-03
60 };
61
62 static void
fht(FLOAT * fz,int n)63 fht(FLOAT * fz, int n)
64 {
65 const FLOAT *tri = costab;
66 int k4;
67 FLOAT *fi, *gi;
68 FLOAT const *fn;
69
70 n <<= 1; /* to get BLKSIZE, because of 3DNow! ASM routine */
71 fn = fz + n;
72 k4 = 4;
73 do {
74 FLOAT s1, c1;
75 int i, k1, k2, k3, kx;
76 kx = k4 >> 1;
77 k1 = k4;
78 k2 = k4 << 1;
79 k3 = k2 + k1;
80 k4 = k2 << 1;
81 fi = fz;
82 gi = fi + kx;
83 do {
84 FLOAT f0, f1, f2, f3;
85 f1 = fi[0] - fi[k1];
86 f0 = fi[0] + fi[k1];
87 f3 = fi[k2] - fi[k3];
88 f2 = fi[k2] + fi[k3];
89 fi[k2] = f0 - f2;
90 fi[0] = f0 + f2;
91 fi[k3] = f1 - f3;
92 fi[k1] = f1 + f3;
93 f1 = gi[0] - gi[k1];
94 f0 = gi[0] + gi[k1];
95 f3 = SQRT2 * gi[k3];
96 f2 = SQRT2 * gi[k2];
97 gi[k2] = f0 - f2;
98 gi[0] = f0 + f2;
99 gi[k3] = f1 - f3;
100 gi[k1] = f1 + f3;
101 gi += k4;
102 fi += k4;
103 } while (fi < fn);
104 c1 = tri[0];
105 s1 = tri[1];
106 for (i = 1; i < kx; i++) {
107 FLOAT c2, s2;
108 c2 = 1 - (2 * s1) * s1;
109 s2 = (2 * s1) * c1;
110 fi = fz + i;
111 gi = fz + k1 - i;
112 do {
113 FLOAT a, b, g0, f0, f1, g1, f2, g2, f3, g3;
114 b = s2 * fi[k1] - c2 * gi[k1];
115 a = c2 * fi[k1] + s2 * gi[k1];
116 f1 = fi[0] - a;
117 f0 = fi[0] + a;
118 g1 = gi[0] - b;
119 g0 = gi[0] + b;
120 b = s2 * fi[k3] - c2 * gi[k3];
121 a = c2 * fi[k3] + s2 * gi[k3];
122 f3 = fi[k2] - a;
123 f2 = fi[k2] + a;
124 g3 = gi[k2] - b;
125 g2 = gi[k2] + b;
126 b = s1 * f2 - c1 * g3;
127 a = c1 * f2 + s1 * g3;
128 fi[k2] = f0 - a;
129 fi[0] = f0 + a;
130 gi[k3] = g1 - b;
131 gi[k1] = g1 + b;
132 b = c1 * g2 - s1 * f3;
133 a = s1 * g2 + c1 * f3;
134 gi[k2] = g0 - a;
135 gi[0] = g0 + a;
136 fi[k3] = f1 - b;
137 fi[k1] = f1 + b;
138 gi += k4;
139 fi += k4;
140 } while (fi < fn);
141 c2 = c1;
142 c1 = c2 * tri[0] - s1 * tri[1];
143 s1 = c2 * tri[1] + s1 * tri[0];
144 }
145 tri += 2;
146 } while (k4 < n);
147 }
148
149
150 static const unsigned char rv_tbl[] = {
151 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
152 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
153 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
154 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
155 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
156 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
157 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
158 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
159 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
160 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
161 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
162 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
163 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
164 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
165 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
166 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
167 };
168
169 #define ch01(index) (buffer[chn][index])
170
171 #define ml00(f) (window[i ] * f(i))
172 #define ml10(f) (window[i + 0x200] * f(i + 0x200))
173 #define ml20(f) (window[i + 0x100] * f(i + 0x100))
174 #define ml30(f) (window[i + 0x300] * f(i + 0x300))
175
176 #define ml01(f) (window[i + 0x001] * f(i + 0x001))
177 #define ml11(f) (window[i + 0x201] * f(i + 0x201))
178 #define ml21(f) (window[i + 0x101] * f(i + 0x101))
179 #define ml31(f) (window[i + 0x301] * f(i + 0x301))
180
181 #define ms00(f) (window_s[i ] * f(i + k))
182 #define ms10(f) (window_s[0x7f - i] * f(i + k + 0x80))
183 #define ms20(f) (window_s[i + 0x40] * f(i + k + 0x40))
184 #define ms30(f) (window_s[0x3f - i] * f(i + k + 0xc0))
185
186 #define ms01(f) (window_s[i + 0x01] * f(i + k + 0x01))
187 #define ms11(f) (window_s[0x7e - i] * f(i + k + 0x81))
188 #define ms21(f) (window_s[i + 0x41] * f(i + k + 0x41))
189 #define ms31(f) (window_s[0x3e - i] * f(i + k + 0xc1))
190
191 void
fft_short(lame_internal_flags const * const gfc,FLOAT x_real[3][BLKSIZE_s],int chn,const sample_t * const buffer[2])192 fft_short(lame_internal_flags const *const gfc,
193 FLOAT x_real[3][BLKSIZE_s], int chn, const sample_t *const buffer[2])
194 {
195 int i;
196 int j;
197 int b;
198
199 #define window_s gfc->cd_psy->window_s
200 #define window gfc->cd_psy->window
201
202 for (b = 0; b < 3; b++) {
203 FLOAT *x = &x_real[b][BLKSIZE_s / 2];
204 short const k = (576 / 3) * (b + 1);
205 j = BLKSIZE_s / 8 - 1;
206 do {
207 FLOAT f0, f1, f2, f3, w;
208
209 i = rv_tbl[j << 2];
210
211 f0 = ms00(ch01);
212 w = ms10(ch01);
213 f1 = f0 - w;
214 f0 = f0 + w;
215 f2 = ms20(ch01);
216 w = ms30(ch01);
217 f3 = f2 - w;
218 f2 = f2 + w;
219
220 x -= 4;
221 x[0] = f0 + f2;
222 x[2] = f0 - f2;
223 x[1] = f1 + f3;
224 x[3] = f1 - f3;
225
226 f0 = ms01(ch01);
227 w = ms11(ch01);
228 f1 = f0 - w;
229 f0 = f0 + w;
230 f2 = ms21(ch01);
231 w = ms31(ch01);
232 f3 = f2 - w;
233 f2 = f2 + w;
234
235 x[BLKSIZE_s / 2 + 0] = f0 + f2;
236 x[BLKSIZE_s / 2 + 2] = f0 - f2;
237 x[BLKSIZE_s / 2 + 1] = f1 + f3;
238 x[BLKSIZE_s / 2 + 3] = f1 - f3;
239 } while (--j >= 0);
240
241 #undef window
242 #undef window_s
243
244 gfc->fft_fht(x, BLKSIZE_s / 2);
245 /* BLKSIZE_s/2 because of 3DNow! ASM routine */
246 }
247 }
248
249 void
fft_long(lame_internal_flags const * const gfc,FLOAT x[BLKSIZE],int chn,const sample_t * const buffer[2])250 fft_long(lame_internal_flags const *const gfc,
251 FLOAT x[BLKSIZE], int chn, const sample_t *const buffer[2])
252 {
253 int i;
254 int jj = BLKSIZE / 8 - 1;
255 x += BLKSIZE / 2;
256
257 #define window_s gfc->cd_psy->window_s
258 #define window gfc->cd_psy->window
259
260 do {
261 FLOAT f0, f1, f2, f3, w;
262
263 i = rv_tbl[jj];
264 f0 = ml00(ch01);
265 w = ml10(ch01);
266 f1 = f0 - w;
267 f0 = f0 + w;
268 f2 = ml20(ch01);
269 w = ml30(ch01);
270 f3 = f2 - w;
271 f2 = f2 + w;
272
273 x -= 4;
274 x[0] = f0 + f2;
275 x[2] = f0 - f2;
276 x[1] = f1 + f3;
277 x[3] = f1 - f3;
278
279 f0 = ml01(ch01);
280 w = ml11(ch01);
281 f1 = f0 - w;
282 f0 = f0 + w;
283 f2 = ml21(ch01);
284 w = ml31(ch01);
285 f3 = f2 - w;
286 f2 = f2 + w;
287
288 x[BLKSIZE / 2 + 0] = f0 + f2;
289 x[BLKSIZE / 2 + 2] = f0 - f2;
290 x[BLKSIZE / 2 + 1] = f1 + f3;
291 x[BLKSIZE / 2 + 3] = f1 - f3;
292 } while (--jj >= 0);
293
294 #undef window
295 #undef window_s
296
297 gfc->fft_fht(x, BLKSIZE / 2);
298 /* BLKSIZE/2 because of 3DNow! ASM routine */
299 }
300
301 #ifdef HAVE_NASM
302 extern void fht_3DN(FLOAT * fz, int n);
303 extern void fht_SSE(FLOAT * fz, int n);
304 #endif
305
306 void
init_fft(lame_internal_flags * const gfc)307 init_fft(lame_internal_flags * const gfc)
308 {
309 int i;
310
311 /* The type of window used here will make no real difference, but */
312 /* in the interest of merging nspsytune stuff - switch to blackman window */
313 for (i = 0; i < BLKSIZE; i++)
314 /* blackman window */
315 gfc->cd_psy->window[i] = 0.42 - 0.5 * cos(2 * PI * (i + .5) / BLKSIZE) +
316 0.08 * cos(4 * PI * (i + .5) / BLKSIZE);
317
318 for (i = 0; i < BLKSIZE_s / 2; i++)
319 gfc->cd_psy->window_s[i] = 0.5 * (1.0 - cos(2.0 * PI * (i + 0.5) / BLKSIZE_s));
320
321 gfc->fft_fht = fht;
322 #ifdef HAVE_NASM
323 if (gfc->CPU_features.AMD_3DNow) {
324 gfc->fft_fht = fht_3DN;
325 }
326 else if (gfc->CPU_features.SSE) {
327 gfc->fft_fht = fht_SSE;
328 }
329 else {
330 gfc->fft_fht = fht;
331 }
332 #else
333 #ifdef HAVE_XMMINTRIN_H
334 #ifdef MIN_ARCH_SSE
335 gfc->fft_fht = fht_SSE2;
336 #endif
337 #endif
338 #endif
339 }
340