• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * VP8 DSP functions x86-optimized
3  * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
4  * Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/attributes.h"
24 #include "libavutil/cpu.h"
25 #include "libavutil/mem_internal.h"
26 #include "libavutil/x86/cpu.h"
27 #include "libavcodec/vp8dsp.h"
28 
29 #if HAVE_X86ASM
30 
31 /*
32  * MC functions
33  */
/* All motion-compensation routines declared below are only declared, not
 * defined, in this file and share one signature; VP8_MC_PROTO spells that
 * signature out once. */
#define VP8_MC_PROTO(NAME)                                              \
    void NAME(uint8_t *dst, ptrdiff_t dststride,                        \
              uint8_t *src, ptrdiff_t srcstride,                        \
              int height, int mx, int my)

/* epel, width 4, MMXEXT */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_mmxext);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_mmxext);

/* epel, width 8, SSE2 */
VP8_MC_PROTO(ff_put_vp8_epel8_h4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_sse2);

/* epel, widths 4 and 8, SSSE3 */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_ssse3);

/* bilinear, horizontal */
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_ssse3);

/* bilinear, vertical */
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_mmxext);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_ssse3);

/* whole-pixel routines */
VP8_MC_PROTO(ff_put_vp8_pixels8_mmx);
VP8_MC_PROTO(ff_put_vp8_pixels16_sse);

#undef VP8_MC_PROTO
118 
/*
 * TAP_W16(OPT, FILTERTYPE, TAPTYPE) defines the static function
 * ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>().  It calls the matching
 * ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>() twice with identical strides,
 * height, mx and my: first at dst/src, then at dst + 8 / src + 8.
 */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    int x; \
    for (x = 0; x < 16; x += 8) \
        ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
            dst + x, dststride, src + x, srcstride, height, mx, my); \
}
/*
 * TAP_W8(OPT, FILTERTYPE, TAPTYPE) defines the static function
 * ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>().  It calls the matching
 * ff_put_vp8_<FILTERTYPE>4_<TAPTYPE>_<OPT>() twice with identical strides,
 * height, mx and my: first at dst/src, then at dst + 4 / src + 4.
 */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    int x; \
    for (x = 0; x < 8; x += 4) \
        ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
            dst + x, dststride, src + x, srcstride, height, mx, my); \
}
139 
140 TAP_W16(sse2,  epel, h6)
141 TAP_W16(sse2,  epel, v6)
142 TAP_W16(sse2,  bilinear, h)
143 TAP_W16(sse2,  bilinear, v)
144 
145 TAP_W16(ssse3, epel, h6)
146 TAP_W16(ssse3, epel, v6)
147 TAP_W16(ssse3, bilinear, h)
148 TAP_W16(ssse3, bilinear, v)
149 
/*
 * HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) defines the static
 * function ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT>(), built from
 * the separate h<TAPNUMX> and v<TAPNUMY> routines of the same SIZE and OPT:
 *
 *  1. the horizontal routine writes height + TAPNUMY - 1 rows into an
 *     ALIGN-aligned temporary of stride SIZE, reading from
 *     TAPNUMY / 2 - 1 rows above src;
 *  2. the vertical routine reads that temporary, starting
 *     TAPNUMY / 2 - 1 rows into it, and writes height rows to dst.
 *
 * The temporary holds SIZE * (MAXHEIGHT + TAPNUMY - 1) bytes.
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + TAPNUMY - 1)]); \
    const int rows_above = TAPNUMY / 2 - 1; \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE, src - srcstride * rows_above, srcstride, \
        height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmp + SIZE * rows_above, SIZE, \
        height, mx, my); \
}
163 
164 #define HVTAPMMX(x, y) \
165 HVTAP(mmxext, 8, x, y,  4,  8)
166 
167 HVTAPMMX(4, 4)
168 HVTAPMMX(4, 6)
169 HVTAPMMX(6, 4)
170 HVTAPMMX(6, 6)
171 
172 #define HVTAPSSE2(x, y, w) \
173 HVTAP(sse2,  16, x, y, w, 16) \
174 HVTAP(ssse3, 16, x, y, w, 16)
175 
176 HVTAPSSE2(4, 4, 8)
177 HVTAPSSE2(4, 6, 8)
178 HVTAPSSE2(6, 4, 8)
179 HVTAPSSE2(6, 6, 8)
180 HVTAPSSE2(6, 6, 16)
181 
182 HVTAP(ssse3, 16, 4, 4, 4, 8)
183 HVTAP(ssse3, 16, 4, 6, 4, 8)
184 HVTAP(ssse3, 16, 6, 4, 4, 8)
185 HVTAP(ssse3, 16, 6, 6, 4, 8)
186 
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) defines the static function
 * ff_put_vp8_bilinear<SIZE>_hv_<OPT>().  The horizontal routine writes
 * height + 1 rows into an ALIGN-aligned temporary of stride SIZE
 * (SIZE * (MAXHEIGHT + 2) bytes); the vertical routine then reads that
 * temporary and writes height rows to dst.
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    LOCAL_ALIGNED(ALIGN, uint8_t, tmp, [SIZE * (MAXHEIGHT + 2)]); \
    const ptrdiff_t tmpstride = SIZE; \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        tmp, tmpstride, src, srcstride, height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, tmp, tmpstride, height, mx, my); \
}
198 
199 HVBILIN(mmxext,  8,  4,  8)
200 HVBILIN(sse2,  8,  8, 16)
201 HVBILIN(sse2,  8, 16, 16)
202 HVBILIN(ssse3, 8,  4,  8)
203 HVBILIN(ssse3, 8,  8, 16)
204 HVBILIN(ssse3, 8, 16, 16)
205 
/* Inverse-transform routines; declared here, not defined in this file. */

/* MMX */
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst,
                               int16_t block[2][16],
                               ptrdiff_t stride);

/* SSE */
void ff_vp8_idct_add_sse(uint8_t *dst,
                         int16_t block[16],
                         ptrdiff_t stride);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16],
                            int16_t dc[16]);

/* SSE2 */
void ff_vp8_idct_dc_add_sse2(uint8_t *dst,
                             int16_t block[16],
                             ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst,
                               int16_t block[4][16],
                               ptrdiff_t stride);

/* SSE4 */
void ff_vp8_idct_dc_add_sse4(uint8_t *dst,
                             int16_t block[16],
                             ptrdiff_t stride);
216 
/*
 * DECLARE_LOOP_FILTER(NAME) declares the loop-filter routines carrying the
 * instruction-set suffix NAME (they are not defined in this file):
 *   - v/h "simple" filters:            (dst, stride, flim)
 *   - v/h 16y inner and mbedge filters: (dst, stride, e, i, hvt)
 *   - v/h 8uv inner and mbedge filters: (dstU, dstV, s, e, i, hvt)
 * Every declaration inside the macro already ends in ';', so the macro is
 * invoked without a trailing semicolon.
 */
#define DECLARE_LOOP_FILTER(NAME)                                       \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst,                 \
                                          ptrdiff_t stride,             \
                                          int flim);                    \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst,                 \
                                          ptrdiff_t stride,             \
                                          int flim);                    \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst,              \
                                             ptrdiff_t stride,          \
                                             int e, int i, int hvt);    \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);    \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU,             \
                                             uint8_t *dstV,             \
                                             ptrdiff_t s,               \
                                             int e, int i, int hvt);

/* One invocation per instruction set referenced by ff_vp8dsp_init_x86(). */
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
256 
257 #endif /* HAVE_X86ASM */
258 
/*
 * Both macros below expect a pointer named `c` in scope whose
 * put_vp8_epel_pixels_tab[IDX][..][..] entries they fill with the
 * ff_put_vp8_epel<SIZE>_*_<OPT> routines.  Each is wrapped in
 * do { } while (0) so that it expands to exactly one statement and stays
 * correct under a brace-less if/else; invoke as `MACRO(...);`.
 */

/* Six-tap entries only: [0][2] = h6, [2][0] = v6, [2][2] = h6v6. */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)

/* Every entry except [0][0]: the four-tap and mixed four/six-tap entries
 * first, then the six-tap ones through VP8_LUMA_MC_FUNC. */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)
271 
/*
 * Fill every put_vp8_bilinear_pixels_tab[IDX] entry except [0][0] through a
 * pointer named `c` that must be in scope: row 0 gets the horizontal
 * routine, column 0 the vertical routine, and the remaining four entries
 * the combined hv routine.  Indices 1 and 2 receive the same function.
 * Wrapped in do { } while (0) so that it expands to exactly one statement
 * and stays correct under a brace-less if/else; invoke as `MACRO(...);`.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)
281 
282 
283 av_cold void ff_vp78dsp_init_x86(VP8DSPContext *c)
284 {
285 #if HAVE_X86ASM
286     int cpu_flags = av_get_cpu_flags();
287 
288     if (EXTERNAL_MMX(cpu_flags)) {
289         c->put_vp8_epel_pixels_tab[1][0][0]     =
290         c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
291     }
292 
293     /* note that 4-tap width=16 functions are missing because w=16
294      * is only used for luma, and luma is always a copy or sixtap. */
295     if (EXTERNAL_MMXEXT(cpu_flags)) {
296         VP8_MC_FUNC(2, 4, mmxext);
297         VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
298     }
299 
300     if (EXTERNAL_SSE(cpu_flags)) {
301         c->put_vp8_epel_pixels_tab[0][0][0]     =
302         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
303     }
304 
305     if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
306         VP8_LUMA_MC_FUNC(0, 16, sse2);
307         VP8_MC_FUNC(1, 8, sse2);
308         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
309         VP8_BILINEAR_MC_FUNC(1, 8, sse2);
310     }
311 
312     if (EXTERNAL_SSSE3(cpu_flags)) {
313         VP8_LUMA_MC_FUNC(0, 16, ssse3);
314         VP8_MC_FUNC(1, 8, ssse3);
315         VP8_MC_FUNC(2, 4, ssse3);
316         VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
317         VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
318         VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
319     }
320 #endif /* HAVE_X86ASM */
321 }
322 
ff_vp8dsp_init_x86(VP8DSPContext * c)323 av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
324 {
325 #if HAVE_X86ASM
326     int cpu_flags = av_get_cpu_flags();
327 
328     if (EXTERNAL_MMX(cpu_flags)) {
329         c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
330     }
331 
332     if (EXTERNAL_SSE(cpu_flags)) {
333         c->vp8_idct_add                         = ff_vp8_idct_add_sse;
334         c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
335     }
336 
337     if (EXTERNAL_SSE2_SLOW(cpu_flags)) {
338         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
339 
340         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
341         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
342 
343         c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_sse2;
344         c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_sse2;
345     }
346 
347     if (EXTERNAL_SSE2(cpu_flags)) {
348         c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse2;
349         c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;
350 
351         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse2;
352 
353         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
354         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
355 
356         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse2;
357         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse2;
358     }
359 
360     if (EXTERNAL_SSSE3(cpu_flags)) {
361         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
362         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
363 
364         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
365         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
366         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
367         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
368 
369         c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_ssse3;
370         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_ssse3;
371         c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
372         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
373     }
374 
375     if (EXTERNAL_SSE4(cpu_flags)) {
376         c->vp8_idct_dc_add            = ff_vp8_idct_dc_add_sse4;
377 
378         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
379         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
380         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse4;
381     }
382 #endif /* HAVE_X86ASM */
383 }
384