/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

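/* Prototypes for the external (x86 assembly) compare primitives: sum of
 * absolute DCT coefficients, SSE/SAD between pixel blocks, high-frequency
 * noise measures and the vertical SAD variants used below. */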
int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);

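/* Declare the Hadamard-transform based difference metrics (8x8 and 16x16
 * variants) for each SIMD flavour. */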
#define hadamard_func(cpu)                                                    \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,           \
                                  uint8_t *src2, ptrdiff_t stride, int h);    \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_X86ASM
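/* Noise-preserving SSE: the plain SSE score plus a penalty proportional to
 * how much the high-frequency noise changes between the two blocks, weighted
 * by avctx->nsse_weight (8 when no context is available). */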
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_X86ASM */

#if HAVE_INLINE_ASM

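/* Vertical SAD of a single 16-pixel-wide block: accumulates the absolute
 * differences between each row and the row above it. */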
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((stride & 7) == 0);

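/* SUM compares one 16-byte row against the previous one kept in in0/in1:
 * the psubusb/por pair yields the per-byte absolute difference, which is
 * widened to 16-bit words and accumulated in %%mm6. */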
#define SUM(in0, in1, out0, out1)               \
    "movq (%0), %%mm2\n"                        \
    "movq 8(%0), %%mm3\n"                       \
    "add %2,%0\n"                               \
    "movq %%mm2, " #out0 "\n"                   \
    "movq %%mm3, " #out1 "\n"                   \
    "psubusb " #in0 ", %%mm2\n"                 \
    "psubusb " #in1 ", %%mm3\n"                 \
    "psubusb " #out0 ", " #in0 "\n"             \
    "psubusb " #out1 ", " #in1 "\n"             \
    "por %%mm2, " #in0 "\n"                     \
    "por %%mm3, " #in1 "\n"                     \
    "movq " #in0 ", %%mm2\n"                    \
    "movq " #in1 ", %%mm3\n"                    \
    "punpcklbw %%mm7, " #in0 "\n"               \
    "punpcklbw %%mm7, " #in1 "\n"               \
    "punpckhbw %%mm7, %%mm2\n"                  \
    "punpckhbw %%mm7, %%mm3\n"                  \
    "paddw " #in1 ", " #in0 "\n"                \
    "paddw %%mm3, %%mm2\n"                      \
    "paddw %%mm2, " #in0 "\n"                   \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl    %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq  (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq  %%mm6, %%mm0\n"
        "psrlq $32,   %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq  %%mm0, %%mm6\n"
        "psrlq $16,   %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd  %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

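/* Vertical SAD of the residual pix1 - pix2: the byte-wise difference of each
 * row (biased by 0x80 so signed values fit the unsigned SIMD ops) is compared
 * against the previous row and the absolute differences are accumulated. */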
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((stride & 7) == 0);

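/* Same row-difference accumulation as above, but operating on the biased
 * pix1 - pix2 residual computed on the fly for each row. */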
#define SUM(in0, in1, out0, out1)       \
    "movq (%0), %%mm2\n"                \
    "movq (%1), " #out0 "\n"            \
    "movq 8(%0), %%mm3\n"               \
    "movq 8(%1), " #out1 "\n"           \
    "add %3, %0\n"                      \
    "add %3, %1\n"                      \
    "psubb " #out0 ", %%mm2\n"          \
    "psubb " #out1 ", %%mm3\n"          \
    "pxor %%mm7, %%mm2\n"               \
    "pxor %%mm7, %%mm3\n"               \
    "movq %%mm2, " #out0 "\n"           \
    "movq %%mm3, " #out1 "\n"           \
    "psubusb " #in0 ", %%mm2\n"         \
    "psubusb " #in1 ", %%mm3\n"         \
    "psubusb " #out0 ", " #in0 "\n"     \
    "psubusb " #out1 ", " #in1 "\n"     \
    "por %%mm2, " #in0 "\n"             \
    "por %%mm3, " #in1 "\n"             \
    "movq " #in0 ", %%mm2\n"            \
    "movq " #in1 ", %%mm3\n"            \
    "punpcklbw %%mm7, " #in0 "\n"       \
    "punpcklbw %%mm7, " #in1 "\n"       \
    "punpckhbw %%mm7, %%mm2\n"          \
    "punpckhbw %%mm7, %%mm3\n"          \
    "paddw " #in1 ", " #in0 "\n"        \
    "paddw %%mm3, %%mm2\n"              \
    "paddw %%mm2, " #in0 "\n"           \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

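/* Rounding constants (per 16-bit lane) used when averaging two or four
 * source pixels for the half-pel SAD variants. */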
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

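/* 8-pixel-wide SAD helpers: sad8_1 compares blk1 against blk2 directly,
 * sad8_2 against the rounded average of two source blocks (half-pel x/y,
 * rounding constant preloaded in %%mm5), and sad8_4 against the rounded
 * average of four neighbours (half-pel xy). All of them expect %%mm7 == 0
 * and accumulate 16-bit partial sums in %%mm6. */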
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
        "add %3, %%"FF_REG_a"           \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"FF_REG_a"           \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}

static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"FF_REG_a"           \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}

static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq  (%1, %%"FF_REG_a"), %%mm0\n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"FF_REG_a"), %%mm2\n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"FF_REG_a"           \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}

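/* Horizontally add the four 16-bit partial sums accumulated in %%mm6. */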
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

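/* Generate the public SAD entry points (full-pel and half-pel x, y, xy, in
 * 8x8 and 16x16 block sizes) on top of the helpers above: each one clears
 * the accumulator registers, optionally loads the rounding constant into
 * %%mm5, runs the helper and returns the horizontal sum. */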
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, ptrdiff_t stride, int h)         \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, ptrdiff_t stride, int h)      \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    av_assert2(h == 8);                                                 \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, ptrdiff_t stride, int h)        \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, ptrdiff_t stride, int h)    \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)

#endif /* HAVE_INLINE_ASM */

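/* Install the fastest available implementations into the MECmpContext,
 * depending on the detected CPU flags and the codec flags (bit-exactness). */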
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
        c->sse[0]            = ff_sse16_mmx;
        c->sse[1]            = ff_sse8_mmx;
#if HAVE_X86ASM
        c->nsse[0]           = nsse16_mmx;
        c->nsse[1]           = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0]            = ff_sse16_sse2;
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0]        = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4]       = ff_vsad_intra16_sse2;
            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0]       = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem   = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}