1 /*
2 * Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 *
10 */
11
12 #include <stdbool.h>
13 #include <stdint.h>
14
15 #include "dl/api/omxtypes.h"
16 #include "dl/sp/api/omxSP.h"
17 #include "dl/sp/api/x86SP.h"
18 #include "dl/sp/src/x86/x86SP_SSE_Math.h"
19
20 extern OMX_F32* x86SP_F32_radix2_kernel_OutOfPlace(
21 const OMX_F32 *src,
22 OMX_F32 *buf1,
23 OMX_F32 *buf2,
24 const OMX_F32 *twiddle,
25 OMX_INT n,
26 bool forward_fft);
27
28 extern OMX_F32* x86SP_F32_radix4_kernel_OutOfPlace_sse(
29 const OMX_F32 *src,
30 OMX_F32 *buf1,
31 OMX_F32 *buf2,
32 const OMX_F32 *twiddle,
33 OMX_INT n,
34 bool forward_fft);
35
36 /**
37 * A two-for-one algorithm is used here to do the real fft:
38 *
39 * Input x[n], (n = 0, ..., N - 1)
40 * Output X[k] = DFT(N, k){x}
41 * a[n] = x[2n], (n = 0, ..., N/2 - 1)
42 * b[n] = x[2n + 1], (n = 0, ..., N/2 - 1)
43 * z[n] = a[n] + j * b[n]
44 * Z[k] = DFT(N/2, k){z}
45 * Z' is the complex conjugate of Z
46 * A[k] = (Z[k] + Z'[N/2 - k]) / 2
47 * B[k] = -j * (Z[k] - Z'[N/2 - k]) / 2
48 * X[k] = A[k] + B[k] * W[k], (W = exp(-j*2*PI*k/N); k = 0, ..., N/2 - 1)
49 * X[k] = A[k] - B[k], (k = N/2)
50 * X' is complex conjugate of X
51 * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
52 */
53
54 /**
55 * This function is the last permutation of two-for-one FFT algorithm.
56 * We move the division by 2 to the last step in the implementation, so:
57 * A[k] = (Z[k] + Z'[N/2 - k])
58 * B[k] = -j * (Z[k] - Z'[N/2 - k])
59 * X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
60 * X[k] = (A[k] - B[k]), (k = N/2)
61 * X[k] = X'[N - k], (k = N/2 + 1, ..., N - 1)
62 */
RevbinPermuteFwd(const OMX_F32 * in,OMX_F32 * out,const OMX_F32 * twiddle,OMX_INT n)63 static void RevbinPermuteFwd(
64 const OMX_F32 *in,
65 OMX_F32 *out,
66 const OMX_F32 *twiddle,
67 OMX_INT n) {
68 OMX_INT i;
69 OMX_INT j;
70 OMX_INT n_by_2 = n >> 1;
71 OMX_INT n_by_4 = n >> 2;
72
73 OMX_FC32 big_a;
74 OMX_FC32 big_b;
75 OMX_FC32 temp;
76 const OMX_F32 *tw;
77
78 for (i = 1, j = n_by_2 - 1; i < n_by_4; i++, j--) {
79 // A[k] = (Z[k] + Z'[N/2 - k])
80 big_a.Re = in[i] + in[j];
81 big_a.Im = in[j + n_by_2] - in[i + n_by_2];
82
83 // B[k] = -j * (Z[k] - Z'[N/2 - k])
84 big_b.Re = in[j] - in[i];
85 big_b.Im = in[j + n_by_2] + in[i + n_by_2];
86
87 // W[k]
88 tw = twiddle + i;
89
90 // temp = B[k] * W[k]
91 temp.Re = big_b.Re * tw[0] + big_b.Im * tw[n];
92 temp.Im = big_b.Re * tw[n] - big_b.Im * tw[0];
93
94 // Convert split format to interleaved format.
95 // X[k] = (A[k] + B[k] * W[k]) / 2, (k = 0, ..., N/2 - 1)
96 out[i << 1] = 0.5f * (big_a.Re - temp.Im);
97 out[(i << 1) + 1] = 0.5f * (temp.Re - big_a.Im);
98 // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1)
99 out[j << 1] = 0.5f * (big_a.Re + temp.Im);
100 out[(j << 1) + 1] = 0.5f * (temp.Re + big_a.Im);
101 }
102
103 // X[k] = A[k] - B[k] (k = N/2)
104 out[n_by_2] = in[n_by_4];
105 out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
106
107 out[0] = in[0] + in[n_by_2];
108 out[1] = 0;
109 out[n] = in[0] - in[n_by_2];
110 out[n + 1] = 0;
111 }
112
113 // Sse version of RevbinPermuteFwd function.
RevbinPermuteFwdSse(const OMX_F32 * in,OMX_F32 * out,const OMX_F32 * twiddle,OMX_INT n)114 static void RevbinPermuteFwdSse(
115 const OMX_F32 *in,
116 OMX_F32 *out,
117 const OMX_F32 *twiddle,
118 OMX_INT n) {
119 OMX_INT i;
120 OMX_INT j;
121 OMX_INT n_by_2 = n >> 1;
122 OMX_INT n_by_4 = n >> 2;
123
124 VC v_i;
125 VC v_j;
126 VC v_big_a;
127 VC v_big_b;
128 VC v_temp;
129 VC v_x0;
130 VC v_x1;
131 VC v_tw;
132
133 __m128 factor = _mm_set1_ps(0.5f);
134
135 for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
136 VC_LOAD_SPLIT(&v_i, (in + i), n_by_2);
137
138 VC_LOADU_SPLIT(&v_j, (in + j), n_by_2);
139 VC_REVERSE(&v_j);
140
141 // A[k] = (Z[k] + Z'[N/2 - k])
142 VC_ADD_SUB(&v_big_a, &v_j, &v_i);
143
144 // B[k] = -j * (Z[k] - Z'[N/2 - k])
145 VC_SUB_ADD(&v_big_b, &v_j, &v_i);
146
147 // W[k]
148 VC_LOAD_SPLIT(&v_tw, (twiddle + i), n);
149
150 // temp = B[k] * W[k]
151 VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw);
152
153 VC_SUB_X(&v_x0, &v_big_a, &v_temp);
154 VC_ADD_X(&v_x1, &v_big_a, &v_temp);
155
156 VC_MUL_F(&v_x0, &v_x0, factor);
157 VC_MUL_F(&v_x1, &v_x1, factor);
158
159 // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1)
160 VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0);
161
162 // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1)
163 VC_REVERSE(&v_x1);
164 VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1);
165 }
166
167 out[n_by_2] = in[n_by_4];
168 out[n_by_2 + 1] = -in[n_by_4 + n_by_2];
169
170 out[0] = in[0] + in[n_by_2];
171 out[1] = 0;
172 out[n] = in[0] - in[n_by_2];
173 out[n + 1] = 0;
174 }
175
omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 * pSrc,OMX_F32 * pDst,const OMXFFTSpec_R_F32 * pFFTSpec)176 OMXResult omxSP_FFTFwd_RToCCS_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
177 const OMXFFTSpec_R_F32 *pFFTSpec) {
178 OMX_INT n;
179 OMX_INT n_by_2;
180 OMX_INT n_by_4;
181 const OMX_F32 *twiddle;
182 OMX_F32 *buf;
183
184 const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;
185
186 // Input must be 32 byte aligned
187 if (!pSrc || !pDst || (const uintptr_t)pSrc & 31 || (uintptr_t)pDst & 31)
188 return OMX_Sts_BadArgErr;
189
190 n = pFFTStruct->N;
191
192 // This is to handle the case of order == 1.
193 if (n == 2) {
194 pDst[0] = (pSrc[0] + pSrc[1]);
195 pDst[1] = 0.0f;
196 pDst[2] = (pSrc[0] - pSrc[1]);
197 pDst[3] = 0.0f;
198 return OMX_Sts_NoErr;
199 }
200
201 n_by_2 = n >> 1;
202 n_by_4 = n >> 2;
203 buf = pFFTStruct->pBuf1;
204 twiddle = pFFTStruct->pTwiddle;
205
206 if(n_by_2 >= 16) {
207 buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
208 pSrc,
209 pFFTStruct->pBuf2,
210 buf,
211 twiddle,
212 n_by_2,
213 1);
214 } else {
215 buf = x86SP_F32_radix2_kernel_OutOfPlace(
216 pSrc,
217 pFFTStruct->pBuf2,
218 buf,
219 twiddle,
220 n_by_2,
221 1);
222 }
223
224 if(n >= 8)
225 RevbinPermuteFwdSse(buf, pDst, twiddle, n);
226 else
227 RevbinPermuteFwd(buf, pDst, twiddle, n);
228
229 return OMX_Sts_NoErr;
230 }
231