1 /*
2 * VP8 DSP functions x86-optimized
3 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4 * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5 *
6 * This file is part of FFmpeg.
7 *
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
12 *
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
17 *
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 */
22
23 #include "libavutil/attributes.h"
24 #include "libavutil/cpu.h"
25 #include "libavutil/mem_internal.h"
26 #include "libavutil/x86/cpu.h"
27 #include "libavcodec/vp8dsp.h"
28
29 #if HAVE_X86ASM
30
31 /*
32 * MC functions
33 */
/*
 * Every motion-compensation routine declared below has the same signature,
 * so each prototype is generated from this one template.  None of these
 * functions is defined in this file.
 */
#define VP8_MC_PROTO(NAME)                                  \
    void NAME(uint8_t *dst, ptrdiff_t dststride,            \
              uint8_t *src, ptrdiff_t srcstride,            \
              int height, int mx, int my)

/* epel, width 4, MMXEXT */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_mmxext);

/* epel, width 8, SSE2 */
VP8_MC_PROTO(ff_put_vp8_epel8_h4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_sse2);

/* epel, widths 4 and 8, SSSE3 */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_ssse3);

/* bilinear, horizontal */
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_ssse3);

/* bilinear, vertical */
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_ssse3);

/* "pixels" routines, widths 8 and 16 */
VP8_MC_PROTO(ff_put_vp8_pixels8_mmx);
VP8_MC_PROTO(ff_put_vp8_pixels16_sse);

#undef VP8_MC_PROTO
118
/*
 * Generate a static WIDTH-wide wrapper from the HALF-wide routine of the
 * same filter type, tap type and instruction set: the narrower routine is
 * called once on the left half and once more with dst and src both
 * advanced by HALF bytes.  All other arguments are forwarded unchanged.
 */
#define TAP_SPLIT(OPT, FILTERTYPE, TAPTYPE, WIDTH, HALF)                      \
static void ff_put_vp8_ ## FILTERTYPE ## WIDTH ## _ ## TAPTYPE ## _ ## OPT(   \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src,                          \
    ptrdiff_t srcstride, int height, int mx, int my)                          \
{                                                                             \
    ff_put_vp8_ ## FILTERTYPE ## HALF ## _ ## TAPTYPE ## _ ## OPT(            \
        dst, dststride, src, srcstride, height, mx, my);                      \
    ff_put_vp8_ ## FILTERTYPE ## HALF ## _ ## TAPTYPE ## _ ## OPT(            \
        dst + HALF, dststride, src + HALF, srcstride, height, mx, my);        \
}

/* 16-wide function built from two 8-wide calls. */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
    TAP_SPLIT(OPT, FILTERTYPE, TAPTYPE, 16, 8)

/* 8-wide function built from two 4-wide calls. */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
    TAP_SPLIT(OPT, FILTERTYPE, TAPTYPE, 8, 4)
139
140 TAP_W16(sse2, epel, h6)
141 TAP_W16(sse2, epel, v6)
142 TAP_W16(sse2, bilinear, h)
143 TAP_W16(sse2, bilinear, v)
144
145 TAP_W16(ssse3, epel, h6)
146 TAP_W16(ssse3, epel, v6)
147 TAP_W16(ssse3, bilinear, h)
148 TAP_W16(ssse3, bilinear, v)
149
/*
 * HVTAP defines a static epel function that filters in both directions by
 * chaining the one-dimensional routines of instruction set OPT:
 *
 *   1. the horizontal TAPNUMX-tap routine writes height + TAPNUMY - 1 rows
 *      into a local scratch buffer with row stride SIZE, reading from
 *      (TAPNUMY / 2 - 1) rows above src;
 *   2. the vertical TAPNUMY-tap routine reads that buffer, starting
 *      (TAPNUMY / 2 - 1) rows in, and writes height rows to dst.
 *
 * The scratch buffer is declared through LOCAL_ALIGNED with ALIGN and holds
 * MAXHEIGHT + TAPNUMY - 1 rows; nothing here checks height against
 * MAXHEIGHT, so callers must not exceed it.
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT)                        \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT(   \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src,                                \
    ptrdiff_t srcstride, int height, int mx, int my)                                \
{                                                                                   \
    LOCAL_ALIGNED(ALIGN, uint8_t, hbuf, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]);        \
    uint8_t *const vsrc = hbuf + SIZE * (TAPNUMY / 2 - 1);                          \
                                                                                    \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT(                           \
        hbuf, SIZE, src - srcstride * (TAPNUMY / 2 - 1), srcstride,                 \
        height + TAPNUMY - 1, mx, my);                                              \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT(                           \
        dst, dststride, vsrc, SIZE, height, mx, my);                                \
}
163
164 #define HVTAPMMX(x, y) \
165 HVTAP(mmxext, 8, x, y, 4, 8)
166
167 HVTAPMMX(4, 4)
168 HVTAPMMX(4, 6)
169 HVTAPMMX(6, 4)
170 HVTAPMMX(6, 6)
171
172 #define HVTAPSSE2(x, y, w) \
173 HVTAP(sse2, 16, x, y, w, 16) \
174 HVTAP(ssse3, 16, x, y, w, 16)
175
176 HVTAPSSE2(4, 4, 8)
177 HVTAPSSE2(4, 6, 8)
178 HVTAPSSE2(6, 4, 8)
179 HVTAPSSE2(6, 6, 8)
180 HVTAPSSE2(6, 6, 16)
181
182 HVTAP(ssse3, 16, 4, 4, 4, 8)
183 HVTAP(ssse3, 16, 4, 6, 4, 8)
184 HVTAP(ssse3, 16, 6, 4, 4, 8)
185 HVTAP(ssse3, 16, 6, 6, 4, 8)
186
187 #define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
188 static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
189 uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
190 ptrdiff_t srcstride, int height, int mx, int my) \
191 { \
192 LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
193 ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
194 tmp, SIZE, src, srcstride, height + 1, mx, my); \
195 ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
196 dst, dststride, tmp, SIZE, height, mx, my); \
197 }
198
199 HVBILIN(mmxext, 8, 4, 8)
200 HVBILIN(sse2, 8, 8, 16)
201 HVBILIN(sse2, 8, 16, 16)
202 HVBILIN(ssse3, 8, 4, 8)
203 HVBILIN(ssse3, 8, 8, 16)
204 HVBILIN(ssse3, 8, 16, 16)
205
/*
 * Inverse-transform helpers, grouped by instruction set.  These are
 * declarations only; the functions are not defined in this file.
 */

/* MMX */
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst,
                               int16_t block[2][16],
                               ptrdiff_t stride);

/* SSE */
void ff_vp8_idct_add_sse(uint8_t *dst,
                         int16_t block[16],
                         ptrdiff_t stride);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16],
                            int16_t dc[16]);

/* SSE2 */
void ff_vp8_idct_dc_add_sse2(uint8_t *dst,
                             int16_t block[16],
                             ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst,
                               int16_t block[4][16],
                               ptrdiff_t stride);

/* SSE4 */
void ff_vp8_idct_dc_add_sse4(uint8_t *dst,
                             int16_t block[16],
                             ptrdiff_t stride);
216
/*
 * Declare the full set of loop-filter entry points for one instruction-set
 * suffix NAME.  For each of the vertical (v) and horizontal (h) directions
 * this yields:
 *   - the "simple" filter:            (dst, stride, flim)
 *   - the 16y inner and mbedge filter: (dst, stride, e, i, hvt)
 *   - the 8uv inner and mbedge filter: (dstU, dstV, s, e, i, hvt)
 * These are declarations only; the functions are not defined in this file.
 * The expansion ends with a semicolon, so uses take no trailing ';'.
 */
#define DECLARE_LOOP_FILTER(NAME)                                       \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst,                 \
                                          ptrdiff_t stride,             \
                                          int flim);                    \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst,                 \
                                          ptrdiff_t stride,             \
                                          int flim);                    \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);

/* One declaration set per instruction set, each emitted exactly once. */
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
256
257 #endif /* HAVE_X86ASM */
258
/*
 * Both macros below expect a pointer named `c` in scope whose
 * put_vp8_epel_pixels_tab[IDX][v][h] table is filled with the
 * ff_put_vp8_epel<SIZE>_*_<OPT> routines.  As the assignments show, the
 * second index selects the vertical filter and the third the horizontal
 * one, with 1 meaning four-tap and 2 meaning six-tap.
 *
 * Each macro is wrapped in do { } while (0) so that it behaves as a single
 * statement, e.g. when used as the body of an unbraced `if`.  Invoke with a
 * trailing semicolon.
 */

/* Six-tap entries only: h6, v6 and h6v6. */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)                                                     \
    do {                                                                                     \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT;      \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT;      \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT;    \
    } while (0)

/* Every filtered entry: the four-tap and mixed ones, then the six-tap set. */
#define VP8_MC_FUNC(IDX, SIZE, OPT)                                                          \
    do {                                                                                     \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT;      \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT;      \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT;    \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT;    \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT;    \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT);                                                    \
    } while (0)
271
/*
 * Fill the eight filtered slots of c->put_vp8_bilinear_pixels_tab[IDX]
 * with the ff_put_vp8_bilinear<SIZE>_{h,v,hv}_<OPT> routines; slot [0][0]
 * is left untouched.  A non-zero third index alone gets the horizontal
 * routine, a non-zero second index alone the vertical one, and both
 * non-zero the combined hv routine; values 1 and 2 map to the same
 * function.  Expects a pointer named `c` in scope.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement,
 * e.g. when used as the body of an unbraced `if`.  Invoke with a trailing
 * semicolon.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)                                                      \
    do {                                                                                          \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT;    \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT;    \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT;    \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;   \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;   \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT;    \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;   \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT;   \
    } while (0)
281
282
283 av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
284 {
285 #if HAVE_X86ASM
286 int cpu_flags = av_get_cpu_flags();
287
288 if (EXTERNAL_MMX(cpu_flags)) {
289 c->put_vp8_epel_pixels_tab[1][0][0] =
290 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
291 }
292
293 /* note that 4-tap width=16 functions are missing because w=16
294 * is only used for luma, and luma is always a copy or sixtap. */
295 if (EXTERNAL_MMXEXT(cpu_flags)) {
296 VP8_MC_FUNC(2, 4, mmxext);
297 VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
298 }
299
300 if (EXTERNAL_SSE(cpu_flags)) {
301 c->put_vp8_epel_pixels_tab[0][0][0] =
302 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
303 }
304
305 if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
306 VP8_LUMA_MC_FUNC(0, 16, sse2);
307 VP8_MC_FUNC(1, 8, sse2);
308 VP8_BILINEAR_MC_FUNC(0, 16, sse2);
309 VP8_BILINEAR_MC_FUNC(1, 8, sse2);
310 }
311
312 if (EXTERNAL_SSSE3(cpu_flags)) {
313 VP8_LUMA_MC_FUNC(0, 16, ssse3);
314 VP8_MC_FUNC(1, 8, ssse3);
315 VP8_MC_FUNC(2, 4, ssse3);
316 VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
317 VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
318 VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
319 }
320 #endif /* HAVE_X86ASM */
321 }
322
ff_vp8dsp_init_x86(VP8DSPContext * c)323 av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
324 {
325 #if HAVE_X86ASM
326 int cpu_flags = av_get_cpu_flags();
327
328 if (EXTERNAL_MMX(cpu_flags)) {
329 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
330 }
331
332 if (EXTERNAL_SSE(cpu_flags)) {
333 c->vp8_idct_add = ff_vp8_idct_add_sse;
334 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
335 }
336
337 if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
338 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
339
340 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
341 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
342
343 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
344 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
345 }
346
347 if (EXTERNAL_SSE2(cpu_flags)) {
348 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse2;
349 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
350
351 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
352
353 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
354 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
355
356 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
357 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
358 }
359
360 if (EXTERNAL_SSSE3(cpu_flags)) {
361 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
362 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
363
364 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
365 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
366 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
367 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
368
369 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
370 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
371 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
372 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
373 }
374
375 if (EXTERNAL_SSE4(cpu_flags)) {
376 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
377
378 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
379 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
380 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
381 }
382 #endif /* HAVE_X86ASM */
383 }
384