/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if HAVE_X86ASM
#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
#endif /* HAVE_X86ASM */

void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

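/* Interleaved per-lane copy of ff_mdct_win_float consumed by the four-block
 * ff_four_imdct36_float_* kernels; filled in ff_mpadsp_init_x86_tabs(). */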
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

#if HAVE_6REGS && HAVE_SSE_INLINE

#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

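/* SUM8 accumulates eight products at a stride of 64 floats in both the window
 * and the sample pointer; the same 64-float stride appears as the 256-byte
 * offsets in the inline assembly of apply_window() below. */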
#define SUM8(op, sum, w, p)            \
{                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]); \
    op(sum, (w)[1 * 64], (p)[1 * 64]); \
    op(sum, (w)[2 * 64], (p)[2 * 64]); \
    op(sum, (w)[3 * 64], (p)[3 * 64]); \
    op(sum, (w)[4 * 64], (p)[4 * 64]); \
    op(sum, (w)[5 * 64], (p)[5 * 64]); \
    op(sum, (w)[6 * 64], (p)[6 * 64]); \
    op(sum, (w)[7 * 64], (p)[7 * 64]); \
}

static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa  = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;

#define MULT(a, b)                          \
    "movaps " #a "(%1,%0), %%xmm1 \n\t"     \
    "movaps " #a "(%3,%0), %%xmm2 \n\t"     \
    "mulps  %%xmm2, %%xmm1        \n\t"     \
    "subps  %%xmm1, %%xmm0        \n\t"     \
    "mulps  " #b "(%2,%0), %%xmm2 \n\t"     \
    "subps  %%xmm2, %%xmm4        \n\t"

    __asm__ volatile(
            "1:                           \n\t"
            "xorps  %%xmm0, %%xmm0        \n\t"
            "xorps  %%xmm4, %%xmm4        \n\t"

            MULT(   0,   0)
            MULT( 256,  64)
            MULT( 512, 128)
            MULT( 768, 192)
            MULT(1024, 256)
            MULT(1280, 320)
            MULT(1536, 384)
            MULT(1792, 448)

            "movaps %%xmm0, (%4,%0)       \n\t"
            "movaps %%xmm4, (%5,%0)       \n\t"
            "add    $16,    %0            \n\t"
            "jl     1b                    \n\t"
            :"+&r"(count)
            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
            );

#undef MULT
}
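
/* Plain-C sketch of the inline-asm loop above, kept purely as documentation
 * and never called (the function name is ours, not an FFmpeg API). As read
 * from the assembly: both accumulators start at zero and only subps is used,
 * so both sums come out negated; buf and win1 are walked with a 64-float
 * stride, win2 with a 16-float stride. */
static av_unused void apply_window_c_sketch(const float *buf, const float *win1,
                                            const float *win2, float *sum1,
                                            float *sum2, int len)
{
    for (int i = 0; i < len; i++) {
        float s1 = 0, s2 = 0;
        for (int k = 0; k < 8; k++) {
            s1 -= buf[i + 64 * k] * win1[i + 64 * k];
            s2 -= buf[i + 64 * k] * win2[i + 16 * k];
        }
        sum1[i] = s1;
        sum2[i] = s2;
    }
}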

static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             ptrdiff_t incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    __asm__ volatile(
            "movaps    0(%0), %%xmm0 \n\t" \
            "movaps   16(%0), %%xmm1 \n\t" \
            "movaps   32(%0), %%xmm2 \n\t" \
            "movaps   48(%0), %%xmm3 \n\t" \
            "movaps %%xmm0,    0(%1) \n\t" \
            "movaps %%xmm1,   16(%1) \n\t" \
            "movaps %%xmm2,   32(%1) \n\t" \
            "movaps %%xmm3,   48(%1) \n\t" \
            "movaps   64(%0), %%xmm0 \n\t" \
            "movaps   80(%0), %%xmm1 \n\t" \
            "movaps   96(%0), %%xmm2 \n\t" \
            "movaps  112(%0), %%xmm3 \n\t" \
            "movaps %%xmm0,   64(%1) \n\t" \
            "movaps %%xmm1,   80(%1) \n\t" \
            "movaps %%xmm2,   96(%1) \n\t" \
            "movaps %%xmm3,  112(%1) \n\t"
            ::"r"(in), "r"(in+512)
            :"memory"
            );

    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    SUM8(MACS, suma[0], win + 32, in + 48);

    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

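/* shufps $0x1b reverses the four packed floats, so each SUMS step stores
 * sumd[16-j] - suma[j] to out1 and sumb[16-j] + sumc[j] to out2 (all macro
 * arguments are byte offsets); this is the vector form of the scalar loop
 * in the generic branch below. */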
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
            "movups " #sumd "(%4), %%xmm0     \n\t" \
            "shufps $0x1b, %%xmm0, %%xmm0     \n\t" \
            "subps  " #suma "(%1), %%xmm0     \n\t" \
            "movaps %%xmm0," #out1 "(%0)      \n\t" \
\
            "movups " #sumc "(%3), %%xmm0     \n\t" \
            "shufps $0x1b, %%xmm0, %%xmm0     \n\t" \
            "addps  " #sumb "(%2), %%xmm0     \n\t" \
            "movaps %%xmm0," #out2 "(%0)      \n\t"

    if (incr == 1) {
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        out += 16*incr;
    } else {
        int j;
        float *out2 = out + 32 * incr;
        out[0  ]  = -suma[ 0];
        out += incr;
        out2 -= incr;
        for(j=1;j<16;j++) {
            *out  = -suma[ j] + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[ j];
            out  += incr;
            out2 -= incr;
        }
    }

    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_X86ASM
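/* imdct36_blocks_*(): run `count` 36-point IMDCTs. Blocks are processed four
 * at a time with ff_four_imdct36_float_* and the interleaved mdct_win_sse
 * table (index [1] covers the switch-point case for the first group of four),
 * then a scalar tail falls back to ff_imdct36_float_* with per-block window
 * selection: window 0 for the first two blocks at a switch point, block_type
 * otherwise, +4 for odd-numbered blocks. */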
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                                    int count, int switch_point, int block_type) \
{                                                                           \
    int align_end = count - (count & 3);                                    \
    int j;                                                                  \
    for (j = 0; j < align_end; j+= 4) {                                     \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];       \
        /* apply window & overlap with previous buffer */                   \
                                                                            \
        /* select window */                                                 \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
        in  += 4*18;                                                        \
        buf += 4*18;                                                        \
        out += 4;                                                           \
    }                                                                       \
    for (; j < count; j++) {                                                \
        /* apply window & overlap with previous buffer */                   \
                                                                            \
        /* select window */                                                 \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;             \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];           \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in  += 18;                                                          \
        buf++;                                                              \
        out++;                                                              \
    }                                                                       \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_X86ASM */

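/* Build the interleaved window tables consumed by ff_four_imdct36_float_*:
 * for four consecutive blocks, even-numbered blocks take their taps from
 * ff_mdct_win_float[j] and odd-numbered ones from ff_mdct_win_float[j + 4];
 * the [1] variant additionally forces windows 0/4 into the first two lanes
 * for the switch-point case handled in imdct36_blocks_*(). */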
av_cold void ff_mpadsp_init_x86_tabs(void)
{
    int i, j;
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i ++) {
            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }
}

av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    av_unused int cpu_flags = av_get_cpu_flags();

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags)) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

#if HAVE_X86ASM
#if HAVE_SSE
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse;
    }
#endif
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif
#endif /* HAVE_X86ASM */
}