/*
 * SIMD-optimized motion estimation
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/me_cmp.h"
#include "libavcodec/mpegvideo.h"

int ff_sum_abs_dctelem_mmx(int16_t *block);
int ff_sum_abs_dctelem_mmxext(int16_t *block);
int ff_sum_abs_dctelem_sse2(int16_t *block);
int ff_sum_abs_dctelem_ssse3(int16_t *block);
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                ptrdiff_t stride, int h);
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                 ptrdiff_t stride, int h);
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   ptrdiff_t stride, int h);
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                    ptrdiff_t stride, int h);
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  ptrdiff_t stride, int h);
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h);
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                       ptrdiff_t stride, int h);
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h);
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                              ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                               ptrdiff_t stride, int h);
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                             ptrdiff_t stride, int h);
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h);
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                           ptrdiff_t stride, int h);
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                            ptrdiff_t stride, int h);
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h);

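/* Prototypes for the external-assembly Hadamard-transform difference
 * (hadamard8_diff) functions, declared once per instruction-set suffix. */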
#define hadamard_func(cpu) \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                  uint8_t *src2, ptrdiff_t stride, int h); \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
                                    uint8_t *src2, ptrdiff_t stride, int h);

hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)

#if HAVE_X86ASM
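/* Noise-preserving SSE: the plain SSE score plus the absolute difference in
 * high-frequency noise between the two blocks, scaled by nsse_weight. */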
static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int score1, score2;

    if (c)
        score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
    else
        score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
    score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
           - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
                     ptrdiff_t stride, int h)
{
    int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
    int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
                 ff_hf_noise8_mmx(pix2, stride, h);

    if (c)
        return score1 + FFABS(score2) * c->avctx->nsse_weight;
    else
        return score1 + FFABS(score2) * 8;
}

#endif /* HAVE_X86ASM */

#if HAVE_INLINE_ASM

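/* Vertical SAD of a single 16-pixel-wide block: sums the absolute
 * differences between each line and the line below it (intra variant,
 * the second block operand is unused). */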
static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
                            ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq 8(%0), %%mm3\n" \
    "add %2,%0\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %3, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pxor %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "add %2, %0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %1\n"
        : "+r" (pix), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0xFFFF;
}
#undef SUM

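/* Vertical SAD of the difference between two 16-pixel-wide blocks: the
 * per-pixel differences are biased into unsigned range (XOR with the 0x80
 * bytes built in %mm7) and the absolute vertical gradient of that
 * difference image is summed. */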
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                      ptrdiff_t stride, int h)
{
    int tmp;

    av_assert2((((int) pix1) & 7) == 0);
    av_assert2((((int) pix2) & 7) == 0);
    av_assert2((stride & 7) == 0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n" \
    "movq (%1), " #out0 "\n" \
    "movq 8(%0), %%mm3\n" \
    "movq 8(%1), " #out1 "\n" \
    "add %3, %0\n" \
    "add %3, %1\n" \
    "psubb " #out0 ", %%mm2\n" \
    "psubb " #out1 ", %%mm3\n" \
    "pxor %%mm7, %%mm2\n" \
    "pxor %%mm7, %%mm3\n" \
    "movq %%mm2, " #out0 "\n" \
    "movq %%mm3, " #out1 "\n" \
    "psubusb " #in0 ", %%mm2\n" \
    "psubusb " #in1 ", %%mm3\n" \
    "psubusb " #out0 ", " #in0 "\n" \
    "psubusb " #out1 ", " #in1 "\n" \
    "por %%mm2, " #in0 "\n" \
    "por %%mm3, " #in1 "\n" \
    "movq " #in0 ", %%mm2\n" \
    "movq " #in1 ", %%mm3\n" \
    "punpcklbw %%mm7, " #in0 "\n" \
    "punpcklbw %%mm7, " #in1 "\n" \
    "punpckhbw %%mm7, %%mm2\n" \
    "punpckhbw %%mm7, %%mm3\n" \
    "paddw " #in1 ", " #in0 "\n" \
    "paddw %%mm3, %%mm2\n" \
    "paddw %%mm2, " #in0 "\n" \
    "paddw " #in0 ", %%mm6\n"


    __asm__ volatile (
        "movl %4, %%ecx\n"
        "pxor %%mm6, %%mm6\n"
        "pcmpeqw %%mm7, %%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0), %%mm0\n"
        "movq (%1), %%mm2\n"
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "add %3, %0\n"
        "add %3, %1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movd %%mm0, %2\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
        : "r" (stride), "m" (h)
        : "%ecx");

    return tmp & 0x7FFF;
}
#undef SUM

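/* Rounding biases for the half-pel averages: round_tab[1] for the two-tap
 * x2/y2 cases, round_tab[2] for the four-tap xy2 case. */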
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

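/* SAD of one 8-pixel-wide block over h rows; per-row sums are accumulated
 * as packed words in %mm6, which the caller must clear beforehand. */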
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        "psubusb %%mm0, %%mm2 \n\t"
        "psubusb %%mm4, %%mm0 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm5, %%mm1 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm1, %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm2 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm3 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm3, %%mm2 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %3, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
}

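/* SAD against the rounded average of two reference blocks (blk1a/blk1b),
 * used for the horizontal and vertical half-pel positions; expects the
 * rounding constant in %mm5 and cleared %mm6/%mm7. */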
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpckhbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "psrlw $1, %%mm1 \n\t"
        "psrlw $1, %%mm3 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm2, %%mm1 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "movq %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" (stride));
}

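/* SAD against the four-tap (xy half-pel) average of blk1, blk1+1 and the
 * same pair one line below, using round_tab[2] as the rounding bias. */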
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
                              ptrdiff_t stride, int h)
{
    x86_reg len = -stride * h;
    __asm__ volatile (
        "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddw %%mm4, %%mm2 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "movq %5, %%mm5 \n\t"
        "paddw %%mm2, %%mm0 \n\t"
        "paddw %%mm3, %%mm1 \n\t"
        "paddw %%mm5, %%mm0 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0 \n\t"
        "psrlw $2, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "psubusb %%mm0, %%mm4 \n\t"
        "psubusb %%mm5, %%mm0 \n\t"
        "por %%mm4, %%mm0 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm4 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "paddw %%mm4, %%mm6 \n\t"
        "movq %%mm2, %%mm0 \n\t"
        "movq %%mm3, %%mm1 \n\t"
        "add %4, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" (stride), "m" (round_tab[2]));
}

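/* Horizontally add the packed-word accumulator in %mm6 and return the low
 * 16 bits of the sum. */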
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}

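/* Thin wrappers selecting the reference pair for the horizontal (x2) and
 * vertical (y2) half-pel SAD. */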
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}

static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
                                ptrdiff_t stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}

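/* Instantiate the 8x8 and 16x16 SAD entry points (full-pel, x2, y2, xy2)
 * for a given suffix: each wrapper clears the MMX accumulators, loads the
 * rounding constant where needed and combines the 8-wide helpers above. */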
#define PIX_SAD(suf) \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                        uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                           uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    av_assert2(h == 8); \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, 8); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                         uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        :); \
 \
    sad8_1_ ## suf(blk1, blk2, stride, h); \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_x2a_ ## suf(blk1, blk2, stride, h); \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                            uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        "movq %0, %%mm5 \n\t" \
        :: "m" (round_tab[1])); \
 \
    sad8_y2a_ ## suf(blk1, blk2, stride, h); \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \
 \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
                             uint8_t *blk1, ptrdiff_t stride, int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "pxor %%mm6, %%mm6 \n\t" \
        ::); \
 \
    sad8_4_ ## suf(blk1, blk2, stride, h); \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
 \
    return sum_ ## suf(); \
} \

PIX_SAD(mmx)

#endif /* HAVE_INLINE_ASM */

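/* Install the fastest available compare functions for the detected CPU
 * flags; the approximate variants are skipped in bit-exact mode. */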
av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
{
    int cpu_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    if (INLINE_MMX(cpu_flags)) {
        c->pix_abs[0][0] = sad16_mmx;
        c->pix_abs[0][1] = sad16_x2_mmx;
        c->pix_abs[0][2] = sad16_y2_mmx;
        c->pix_abs[0][3] = sad16_xy2_mmx;
        c->pix_abs[1][0] = sad8_mmx;
        c->pix_abs[1][1] = sad8_x2_mmx;
        c->pix_abs[1][2] = sad8_y2_mmx;
        c->pix_abs[1][3] = sad8_xy2_mmx;

        c->sad[0] = sad16_mmx;
        c->sad[1] = sad8_mmx;

        c->vsad[4] = vsad_intra16_mmx;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->vsad[0] = vsad16_mmx;
        }
    }

#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
        c->sse[0] = ff_sse16_mmx;
        c->sse[1] = ff_sse8_mmx;
#if HAVE_X86ASM
        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
#endif
    }

    if (EXTERNAL_MMXEXT(cpu_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;

        c->sad[0] = ff_sad16_mmxext;
        c->sad[1] = ff_sad8_mmxext;

        c->pix_abs[0][0] = ff_sad16_mmxext;
        c->pix_abs[0][1] = ff_sad16_x2_mmxext;
        c->pix_abs[0][2] = ff_sad16_y2_mmxext;
        c->pix_abs[1][0] = ff_sad8_mmxext;
        c->pix_abs[1][1] = ff_sad8_x2_mmxext;
        c->pix_abs[1][2] = ff_sad8_y2_mmxext;

        c->vsad[4] = ff_vsad_intra16_mmxext;
        c->vsad[5] = ff_vsad_intra8_mmxext;

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
            c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
            c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;

            c->vsad[0] = ff_vsad16_approx_mmxext;
            c->vsad[1] = ff_vsad8_approx_mmxext;
        }
    }

    if (EXTERNAL_SSE2(cpu_flags)) {
        c->sse[0] = ff_sse16_sse2;
        c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;

#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
        c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
            c->sad[0] = ff_sad16_sse2;
            c->pix_abs[0][0] = ff_sad16_sse2;
            c->pix_abs[0][1] = ff_sad16_x2_sse2;
            c->pix_abs[0][2] = ff_sad16_y2_sse2;

            c->vsad[4] = ff_vsad_intra16_sse2;
            if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
                c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
                c->vsad[0] = ff_vsad16_approx_sse2;
            }
        }
    }

    if (EXTERNAL_SSSE3(cpu_flags)) {
        c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
#if HAVE_ALIGNED_STACK
        c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
        c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
#endif
    }
}