1 /* Copyright (c) 2007-2008 CSIRO
2 Copyright (c) 2007-2008 Xiph.Org Foundation
3 Written by Jean-Marc Valin */
4 /*
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions
7 are met:
8
9 - Redistributions of source code must retain the above copyright
10 notice, this list of conditions and the following disclaimer.
11
12 - Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /* This is a simple MDCT implementation that uses a N/4 complex FFT
30 to do most of the work. It should be relatively straightforward to
31 plug in pretty much and FFT here.
32
33 This replaces the Vorbis FFT (and uses the exact same API), which
34 was a bit too messy and that was ending up duplicating code
35 (might as well use the same FFT everywhere).
36
37 The algorithm is similar to (and inspired from) Fabrice Bellard's
38 MDCT implementation in FFMPEG, but has differences in signs, ordering
39 and scaling in many places.
40 */
41 #ifndef __MDCT_MIPSR1_H__
42 #define __MDCT_MIPSR1_H__
43
44 #ifndef SKIP_CONFIG_H
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 #endif
49
50 #include "mdct.h"
51 #include "kiss_fft.h"
52 #include "_kiss_fft_guts.h"
53 #include <math.h>
54 #include "os_support.h"
55 #include "mathops.h"
56 #include "stack_alloc.h"
57
58 /* Forward MDCT trashes the input array */
59 #define OVERRIDE_clt_mdct_forward
clt_mdct_forward(const mdct_lookup * l,kiss_fft_scalar * in,kiss_fft_scalar * OPUS_RESTRICT out,const opus_val16 * window,int overlap,int shift,int stride,int arch)60 void clt_mdct_forward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
61 const opus_val16 *window, int overlap, int shift, int stride, int arch)
62 {
63 int i;
64 int N, N2, N4;
65 VARDECL(kiss_fft_scalar, f);
66 VARDECL(kiss_fft_cpx, f2);
67 const kiss_fft_state *st = l->kfft[shift];
68 const kiss_twiddle_scalar *trig;
69 opus_val16 scale;
70 #ifdef FIXED_POINT
71 /* Allows us to scale with MULT16_32_Q16(), which is faster than
72 MULT16_32_Q15() on ARM. */
73 int scale_shift = st->scale_shift-1;
74 #endif
75
76 (void)arch;
77
78 SAVE_STACK;
79 scale = st->scale;
80
81 N = l->n;
82 trig = l->trig;
83 for (i=0;i<shift;i++)
84 {
85 N >>= 1;
86 trig += N;
87 }
88 N2 = N>>1;
89 N4 = N>>2;
90
91 ALLOC(f, N2, kiss_fft_scalar);
92 ALLOC(f2, N4, kiss_fft_cpx);
93
94 /* Consider the input to be composed of four blocks: [a, b, c, d] */
95 /* Window, shuffle, fold */
96 {
97 /* Temp pointers to make it really clear to the compiler what we're doing */
98 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in+(overlap>>1);
99 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+N2-1+(overlap>>1);
100 kiss_fft_scalar * OPUS_RESTRICT yp = f;
101 const opus_val16 * OPUS_RESTRICT wp1 = window+(overlap>>1);
102 const opus_val16 * OPUS_RESTRICT wp2 = window+(overlap>>1)-1;
103 for(i=0;i<((overlap+3)>>2);i++)
104 {
105 /* Real part arranged as -d-cR, Imag part arranged as -b+aR*/
106 *yp++ = S_MUL_ADD(*wp2, xp1[N2],*wp1,*xp2);
107 *yp++ = S_MUL_SUB(*wp1, *xp1,*wp2, xp2[-N2]);
108 xp1+=2;
109 xp2-=2;
110 wp1+=2;
111 wp2-=2;
112 }
113 wp1 = window;
114 wp2 = window+overlap-1;
115 for(;i<N4-((overlap+3)>>2);i++)
116 {
117 /* Real part arranged as a-bR, Imag part arranged as -c-dR */
118 *yp++ = *xp2;
119 *yp++ = *xp1;
120 xp1+=2;
121 xp2-=2;
122 }
123 for(;i<N4;i++)
124 {
125 /* Real part arranged as a-bR, Imag part arranged as -c-dR */
126 *yp++ = S_MUL_SUB(*wp2, *xp2, *wp1, xp1[-N2]);
127 *yp++ = S_MUL_ADD(*wp2, *xp1, *wp1, xp2[N2]);
128 xp1+=2;
129 xp2-=2;
130 wp1+=2;
131 wp2-=2;
132 }
133 }
134 /* Pre-rotation */
135 {
136 kiss_fft_scalar * OPUS_RESTRICT yp = f;
137 const kiss_twiddle_scalar *t = &trig[0];
138 for(i=0;i<N4;i++)
139 {
140 kiss_fft_cpx yc;
141 kiss_twiddle_scalar t0, t1;
142 kiss_fft_scalar re, im, yr, yi;
143 t0 = t[i];
144 t1 = t[N4+i];
145 re = *yp++;
146 im = *yp++;
147
148 yr = S_MUL_SUB(re,t0,im,t1);
149 yi = S_MUL_ADD(im,t0,re,t1);
150
151 yc.r = yr;
152 yc.i = yi;
153 yc.r = PSHR32(MULT16_32_Q16(scale, yc.r), scale_shift);
154 yc.i = PSHR32(MULT16_32_Q16(scale, yc.i), scale_shift);
155 f2[st->bitrev[i]] = yc;
156 }
157 }
158
159 /* N/4 complex FFT, does not downscale anymore */
160 opus_fft_impl(st, f2);
161
162 /* Post-rotate */
163 {
164 /* Temp pointers to make it really clear to the compiler what we're doing */
165 const kiss_fft_cpx * OPUS_RESTRICT fp = f2;
166 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
167 kiss_fft_scalar * OPUS_RESTRICT yp2 = out+stride*(N2-1);
168 const kiss_twiddle_scalar *t = &trig[0];
169 /* Temp pointers to make it really clear to the compiler what we're doing */
170 for(i=0;i<N4;i++)
171 {
172 kiss_fft_scalar yr, yi;
173 yr = S_MUL_SUB(fp->i,t[N4+i] , fp->r,t[i]);
174 yi = S_MUL_ADD(fp->r,t[N4+i] ,fp->i,t[i]);
175 *yp1 = yr;
176 *yp2 = yi;
177 fp++;
178 yp1 += 2*stride;
179 yp2 -= 2*stride;
180 }
181 }
182 RESTORE_STACK;
183 }
184
185 #define OVERRIDE_clt_mdct_backward
clt_mdct_backward(const mdct_lookup * l,kiss_fft_scalar * in,kiss_fft_scalar * OPUS_RESTRICT out,const opus_val16 * OPUS_RESTRICT window,int overlap,int shift,int stride,int arch)186 void clt_mdct_backward(const mdct_lookup *l, kiss_fft_scalar *in, kiss_fft_scalar * OPUS_RESTRICT out,
187 const opus_val16 * OPUS_RESTRICT window, int overlap, int shift, int stride, int arch)
188 {
189 int i;
190 int N, N2, N4;
191 const kiss_twiddle_scalar *trig;
192
193 (void)arch;
194
195 N = l->n;
196 trig = l->trig;
197 for (i=0;i<shift;i++)
198 {
199 N >>= 1;
200 trig += N;
201 }
202 N2 = N>>1;
203 N4 = N>>2;
204
205 /* Pre-rotate */
206 {
207 /* Temp pointers to make it really clear to the compiler what we're doing */
208 const kiss_fft_scalar * OPUS_RESTRICT xp1 = in;
209 const kiss_fft_scalar * OPUS_RESTRICT xp2 = in+stride*(N2-1);
210 kiss_fft_scalar * OPUS_RESTRICT yp = out+(overlap>>1);
211 const kiss_twiddle_scalar * OPUS_RESTRICT t = &trig[0];
212 const opus_int16 * OPUS_RESTRICT bitrev = l->kfft[shift]->bitrev;
213 for(i=0;i<N4;i++)
214 {
215 int rev;
216 kiss_fft_scalar yr, yi;
217 rev = *bitrev++;
218 yr = S_MUL_ADD(*xp2, t[i] , *xp1, t[N4+i]);
219 yi = S_MUL_SUB(*xp1, t[i] , *xp2, t[N4+i]);
220 /* We swap real and imag because we use an FFT instead of an IFFT. */
221 yp[2*rev+1] = yr;
222 yp[2*rev] = yi;
223 /* Storing the pre-rotation directly in the bitrev order. */
224 xp1+=2*stride;
225 xp2-=2*stride;
226 }
227 }
228
229 opus_fft_impl(l->kfft[shift], (kiss_fft_cpx*)(out+(overlap>>1)));
230
231 /* Post-rotate and de-shuffle from both ends of the buffer at once to make
232 it in-place. */
233 {
234 kiss_fft_scalar * OPUS_RESTRICT yp0 = out+(overlap>>1);
235 kiss_fft_scalar * OPUS_RESTRICT yp1 = out+(overlap>>1)+N2-2;
236 const kiss_twiddle_scalar *t = &trig[0];
237 /* Loop to (N4+1)>>1 to handle odd N4. When N4 is odd, the
238 middle pair will be computed twice. */
239 for(i=0;i<(N4+1)>>1;i++)
240 {
241 kiss_fft_scalar re, im, yr, yi;
242 kiss_twiddle_scalar t0, t1;
243 /* We swap real and imag because we're using an FFT instead of an IFFT. */
244 re = yp0[1];
245 im = yp0[0];
246 t0 = t[i];
247 t1 = t[N4+i];
248 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
249 yr = S_MUL_ADD(re,t0 , im,t1);
250 yi = S_MUL_SUB(re,t1 , im,t0);
251 /* We swap real and imag because we're using an FFT instead of an IFFT. */
252 re = yp1[1];
253 im = yp1[0];
254 yp0[0] = yr;
255 yp1[1] = yi;
256
257 t0 = t[(N4-i-1)];
258 t1 = t[(N2-i-1)];
259 /* We'd scale up by 2 here, but instead it's done when mixing the windows */
260 yr = S_MUL_ADD(re,t0,im,t1);
261 yi = S_MUL_SUB(re,t1,im,t0);
262 yp1[0] = yr;
263 yp0[1] = yi;
264 yp0 += 2;
265 yp1 -= 2;
266 }
267 }
268
269 /* Mirror on both sides for TDAC */
270 {
271 kiss_fft_scalar * OPUS_RESTRICT xp1 = out+overlap-1;
272 kiss_fft_scalar * OPUS_RESTRICT yp1 = out;
273 const opus_val16 * OPUS_RESTRICT wp1 = window;
274 const opus_val16 * OPUS_RESTRICT wp2 = window+overlap-1;
275
276 for(i = 0; i < overlap/2; i++)
277 {
278 kiss_fft_scalar x1, x2;
279 x1 = *xp1;
280 x2 = *yp1;
281 *yp1++ = MULT16_32_Q15(*wp2, x2) - MULT16_32_Q15(*wp1, x1);
282 *xp1-- = MULT16_32_Q15(*wp1, x2) + MULT16_32_Q15(*wp2, x1);
283 wp1++;
284 wp2--;
285 }
286 }
287 }
288 #endif /* __MDCT_MIPSR1_H__ */
289