/*
 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 *
 */

#include <emmintrin.h>
#include <assert.h>

/**
 * Two data formats are used internally by the FFT routines. The
 * interface to the main external FFT routines uses interleaved complex
 * values, where each real part is followed by its imaginary part.
 *
 * The other format is the split format, where a complex vector of real
 * and imaginary values is split such that all of the real values are
 * placed in the first half of the vector and the corresponding
 * imaginary values are placed in the second half, in the same order.
 * The conversion from interleaved complex values to split format and
 * back is transparent to the external FFT interface.
 *
 * VComplex uses the split format.
 */
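
/**
 * Layout example (illustrative, not part of the library): with
 * offset == 4, four complex values z[k] = re[k] + i*im[k] occupy
 *
 *   interleaved: { re0, im0, re1, im1, re2, im2, re3, im3 }
 *   split:       { re0, re1, re2, re3 } at p[0..3] and
 *                { im0, im1, im2, im3 } at p[offset..offset+3]
 *
 * so, for example, VC_LOAD_INTERLEAVE followed by VC_STORE_SPLIT
 * (defined below) converts one vector of four complex values from the
 * interleaved layout to the split layout.
 */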

/** VComplex holds 4 complex float elements, with the real parts stored
 * in real and the corresponding imaginary parts in imag.
 */
typedef struct VComplex {
  __m128 real;
  __m128 imag;
} VC;

/* out = a * b */
static __inline void VC_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(_mm_mul_ps(a->real, b->real),
                         _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_add_ps(_mm_mul_ps(a->real, b->imag),
                         _mm_mul_ps(a->imag, b->real));
}

/* out = conj(a) * b */
static __inline void VC_CONJ_MUL(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(_mm_mul_ps(a->real, b->real),
                         _mm_mul_ps(a->imag, b->imag));
  out->imag = _mm_sub_ps(_mm_mul_ps(a->real, b->imag),
                         _mm_mul_ps(a->imag, b->real));
}
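
/*
 * For reference, the scalar identities that VC_MUL and VC_CONJ_MUL
 * apply to each of the four lanes are (illustrative sketch, one lane):
 *
 *   out_re = a_re * b_re - a_im * b_im;   // VC_MUL: out = a * b
 *   out_im = a_re * b_im + a_im * b_re;
 *
 *   out_re = a_re * b_re + a_im * b_im;   // VC_CONJ_MUL: out = conj(a) * b
 *   out_im = a_re * b_im - a_im * b_re;
 */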

/* Scale complex by a real factor */
static __inline void VC_MUL_F(VC *out, VC *a, __m128 factor) {
  out->real = _mm_mul_ps(factor, a->real);
  out->imag = _mm_mul_ps(factor, a->imag);
}

/* out = a + b */
static __inline void VC_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_add_ps(b->real, a->imag);
}

/* VC_ADD and store the result in split format. */
static __inline void VC_ADD_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->imag));
}

/* out = a - b */
static __inline void VC_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = b.real - a.imag
 */
static __inline void VC_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(b->real, a->imag);
}

/* VC_SUB and store the result in split format. */
static __inline void VC_SUB_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->real));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->imag));
}

/**
 * out.real = a.real + b.real
 * out.imag = a.imag - b.imag
 */
static __inline void VC_ADD_SUB(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->real);
  out->imag = _mm_sub_ps(a->imag, b->imag);
}

/**
 * out.real = a.real + b.imag
 * out.imag = a.imag - b.real
 */
static __inline void VC_ADD_SUB_X(VC *out, VC *a, VC *b) {
  out->real = _mm_add_ps(a->real, b->imag);
  out->imag = _mm_sub_ps(a->imag, b->real);
}

/* VC_ADD_SUB_X and store the result in split format. */
static __inline void VC_ADD_SUB_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_sub_ps(a->imag, b->real));
}

/**
 * out.real = a.real - b.real
 * out.imag = a.imag + b.imag
 */
static __inline void VC_SUB_ADD(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->real);
  out->imag = _mm_add_ps(a->imag, b->imag);
}

/**
 * out.real = a.real - b.imag
 * out.imag = a.imag + b.real
 */
static __inline void VC_SUB_ADD_X(VC *out, VC *a, VC *b) {
  out->real = _mm_sub_ps(a->real, b->imag);
  out->imag = _mm_add_ps(a->imag, b->real);
}
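
/*
 * Note (illustrative): since -i*b = (b.imag, -b.real) and
 * i*b = (-b.imag, b.real), VC_ADD_SUB_X above computes a - i*b and
 * VC_SUB_ADD_X computes a + i*b. The radix4 butterflies below use
 * these helpers to realize multiplication by -j and +j.
 */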

/* VC_SUB_ADD_X and store the result in split format. */
static __inline void VC_SUB_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a, VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_sub_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(a->imag, b->real));
}

/**
 * out[0] = in.real
 * out[offset] = in.imag
 */
static __inline void VC_STORE_SPLIT(
    OMX_F32 *out,
    VC *in,
    OMX_INT offset) {
  _mm_store_ps(out, in->real);
  _mm_store_ps(out + offset, in->imag);
}

/**
 * out.real = in[0];
 * out.imag = in[offset];
 */
static __inline void VC_LOAD_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_load_ps(in);
  out->imag = _mm_load_ps(in + offset);
}

/* Vector complex unpack from split format to interleaved format. */
static __inline void VC_UNPACK(VC *out, VC *in) {
  out->real = _mm_unpacklo_ps(in->real, in->imag);
  out->imag = _mm_unpackhi_ps(in->real, in->imag);
}

/**
 * Vector complex load from an interleaved complex array.
 * out.real = [in[0].real, in[1].real, in[2].real, in[3].real]
 * out.imag = [in[0].imag, in[1].imag, in[2].imag, in[3].imag]
 */
static __inline void VC_LOAD_INTERLEAVE(VC *out, const OMX_F32 *in) {
  __m128 temp0 = _mm_load_ps(in);
  __m128 temp1 = _mm_load_ps(in + 4);
  out->real = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(2, 0, 2, 0));
  out->imag = _mm_shuffle_ps(temp0, temp1, _MM_SHUFFLE(3, 1, 3, 1));
}
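
/*
 * Deinterleaving example (illustrative): for
 * in = { re0, im0, re1, im1, re2, im2, re3, im3 },
 * temp0 = { re0, im0, re1, im1 } and temp1 = { re2, im2, re3, im3 },
 * so picking the even lanes with _MM_SHUFFLE(2, 0, 2, 0) yields
 * real = { re0, re1, re2, re3 }, and the odd lanes with
 * _MM_SHUFFLE(3, 1, 3, 1) yield imag = { im0, im1, im2, im3 }.
 */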

/**
 * Vector complex load in split format.
 * The input address does not need to be 16-byte aligned.
 */
static __inline void VC_LOADU_SPLIT(
    VC *out,
    const OMX_F32 *in,
    OMX_INT offset) {
  out->real = _mm_loadu_ps(in);
  out->imag = _mm_loadu_ps(in + offset);
}

/* Reverse the order of the complex vector. */
static __inline void VC_REVERSE(VC *v) {
  v->real = _mm_shuffle_ps(v->real, v->real, _MM_SHUFFLE(0, 1, 2, 3));
  v->imag = _mm_shuffle_ps(v->imag, v->imag, _MM_SHUFFLE(0, 1, 2, 3));
}

/*
 * Vector complex store to an interleaved complex array.
 * out[0] = in.real[0]
 * out[1] = in.imag[0]
 * out[2] = in.real[1]
 * out[3] = in.imag[1]
 * out[4] = in.real[2]
 * out[5] = in.imag[2]
 * out[6] = in.real[3]
 * out[7] = in.imag[3]
 */
static __inline void VC_STORE_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_store_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_store_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/**
 * Vector complex store in interleaved format.
 * The output address does not need to be 16-byte aligned.
 */
static __inline void VC_STOREU_INTERLEAVE(OMX_F32 *out, VC *in) {
  _mm_storeu_ps(out, _mm_unpacklo_ps(in->real, in->imag));
  _mm_storeu_ps(out + 4, _mm_unpackhi_ps(in->real, in->imag));
}

/* VC_ADD_X and store the result in split format. */
static __inline void VC_ADD_X_STORE_SPLIT(
    OMX_F32 *out,
    VC *a, VC *b,
    OMX_INT offset) {
  _mm_store_ps(out, _mm_add_ps(a->real, b->imag));
  _mm_store_ps(out + offset, _mm_add_ps(b->real, a->imag));
}

/**
 * VC_SUB_X and store the result in reverse order.
 * The output address does not need to be 16-byte aligned.
 */
static __inline void VC_SUB_X_INVERSE_STOREU_SPLIT(
    OMX_F32 *out,
    VC *a,
    VC *b,
    OMX_INT offset) {
  __m128 t;
  t = _mm_sub_ps(a->real, b->imag);
  _mm_storeu_ps(out, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
  t = _mm_sub_ps(b->real, a->imag);
  _mm_storeu_ps(out + offset, _mm_shuffle_ps(t, t, _MM_SHUFFLE(0, 1, 2, 3)));
}

/**
 * Vector complex load from interleaved format to split format.
 * The result is returned in two __m128 registers.
 */
static __inline void VC_LOAD_SHUFFLE(
    __m128 *out0,
    __m128 *out1,
    const OMX_F32 *in) {
  VC temp;
  VC_LOAD_INTERLEAVE(&temp, in);
  *out0 = temp.real;
  *out1 = temp.imag;
}

/* Finish the forward radix4 butterfly calculation and store the outputs. */
static __inline void RADIX4_FWD_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CADD_SUB_X out1, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out1, t1, t3, n);

  /* CSUB_ADD_X out3, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out3, t1, t3, n);
}

/* Finish the inverse radix4 butterfly calculation and store the outputs. */
static __inline void RADIX4_INV_BUTTERFLY_STORE(
    OMX_F32 *out0,
    OMX_F32 *out1,
    OMX_F32 *out2,
    OMX_F32 *out3,
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    OMX_INT n) {
  /* CADD out0, t0, t2 */
  VC_ADD_STORE_SPLIT(out0, t0, t2, n);

  /* CSUB out2, t0, t2 */
  VC_SUB_STORE_SPLIT(out2, t0, t2, n);

  /* CSUB_ADD_X out1, t1, t3 */
  VC_SUB_ADD_X_STORE_SPLIT(out1, t1, t3, n);

  /* CADD_SUB_X out3, t1, t3 */
  VC_ADD_SUB_X_STORE_SPLIT(out3, t1, t3, n);
}
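
/*
 * Equivalently (illustrative): the forward store computes
 *   out0 = t0 + t2,  out1 = t1 - j*t3,  out2 = t0 - t2,  out3 = t1 + j*t3,
 * while the inverse store swaps the roles of -j and +j for out1 and out3.
 */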

/* Radix4 forward butterfly */
static __inline void RADIX4_FWD_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, Tw1, T1 */
  VC_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, Tw2, T2 */
  VC_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, Tw3, T3 */
  VC_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix4 inverse butterfly */
static __inline void RADIX4_INV_BUTTERFLY(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *Tw1,
    VC *Tw2,
    VC *Tw3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  VC tt1, tt2, tt3;

  /* CMUL tt1, conj(Tw1), T1 */
  VC_CONJ_MUL(&tt1, Tw1, T1);

  /* CMUL tt2, conj(Tw2), T2 */
  VC_CONJ_MUL(&tt2, Tw2, T2);

  /* CMUL tt3, conj(Tw3), T3 */
  VC_CONJ_MUL(&tt3, Tw3, T3);

  /* CADD t0, T0, tt2 */
  VC_ADD(t0, T0, &tt2);

  /* CSUB t1, T0, tt2 */
  VC_SUB(t1, T0, &tt2);

  /* CADD t2, tt1, tt3 */
  VC_ADD(t2, &tt1, &tt3);

  /* CSUB t3, tt1, tt3 */
  VC_SUB(t3, &tt1, &tt3);
}

/* Radix4 butterfly in the first stage, for both forward and inverse */
static __inline void RADIX4_BUTTERFLY_FS(
    VC *t0,
    VC *t1,
    VC *t2,
    VC *t3,
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3) {
  /* CADD t0, T0, T2 */
  VC_ADD(t0, T0, T2);

  /* CSUB t1, T0, T2 */
  VC_SUB(t1, T0, T2);

  /* CADD t2, T1, T3 */
  VC_ADD(t2, T1, T3);

  /* CSUB t3, T1, T3 */
  VC_SUB(t3, T1, T3);
}
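
/*
 * Taken together (illustrative): RADIX4_FWD_BUTTERFLY followed by
 * RADIX4_FWD_BUTTERFLY_STORE evaluates a 4-point DFT of the twiddled
 * inputs [T0, Tw1*T1, Tw2*T2, Tw3*T3], i.e. out_k = sum_m (-j)^(k*m) * tt_m.
 * The inverse pair uses conjugated twiddles and +j in place of -j, and
 * RADIX4_BUTTERFLY_FS is the first-stage case where all twiddles are 1.
 */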

/**
 * Load 16 float elements (four SSE registers) forming a 4 x 4 matrix,
 * then transpose the matrix:
 *
 *    3,  2,  1,  0          12,  8,  4,  0
 *    7,  6,  5,  4   =====> 13,  9,  5,  1
 *   11, 10,  9,  8          14, 10,  6,  2
 *   15, 14, 13, 12          15, 11,  7,  3
 *
 * This is done once for the real parts (loaded from pT0..pT3) and once
 * for the imaginary parts (loaded from pT0 + n .. pT3 + n).
 */
static __inline void VC_LOAD_MATRIX_TRANSPOSE(
    VC *T0,
    VC *T1,
    VC *T2,
    VC *T3,
    const OMX_F32 *pT0,
    const OMX_F32 *pT1,
    const OMX_F32 *pT2,
    const OMX_F32 *pT3,
    OMX_INT n) {
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;
  __m128 xmm6;
  __m128 xmm7;

  xmm0 = _mm_load_ps(pT0);
  xmm1 = _mm_load_ps(pT1);
  xmm2 = _mm_load_ps(pT2);
  xmm3 = _mm_load_ps(pT3);

  /* Matrix transpose */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->real = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->real = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));

  xmm0 = _mm_load_ps(pT0 + n);
  xmm1 = _mm_load_ps(pT1 + n);
  xmm2 = _mm_load_ps(pT2 + n);
  xmm3 = _mm_load_ps(pT3 + n);

  /* Matrix transpose */
  xmm4 = _mm_unpacklo_ps(xmm0, xmm1);
  xmm5 = _mm_unpackhi_ps(xmm0, xmm1);
  xmm6 = _mm_unpacklo_ps(xmm2, xmm3);
  xmm7 = _mm_unpackhi_ps(xmm2, xmm3);
  T0->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(1, 0, 1, 0));
  T1->imag = _mm_shuffle_ps(xmm4, xmm6, _MM_SHUFFLE(3, 2, 3, 2));
  T2->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(1, 0, 1, 0));
  T3->imag = _mm_shuffle_ps(xmm5, xmm7, _MM_SHUFFLE(3, 2, 3, 2));
}
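
/*
 * Note (illustrative): the unpacklo/unpackhi + shuffle sequence above is
 * the standard SSE 4 x 4 float transpose, equivalent in effect to the
 * _MM_TRANSPOSE4_PS macro from <xmmintrin.h>, applied separately to the
 * quartet of real rows and the quartet of imaginary rows.
 */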