;*****************************************************************************
;* SIMD-optimized motion estimation
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_1
cextern pb_80

SECTION .text

%macro DIFF_PIXELS_1 4
    movh          %1, %3
    movh          %2, %4
    punpcklbw     %2, %1
    punpcklbw     %1, %1
    psubw         %1, %2
%endmacro

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1 m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1 m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add           %1, %5
    add           %2, %5
    DIFF_PIXELS_1 m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1 m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1 m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova        [%6], m0
    DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova          m0, [%6]
%endif
    sub           %1, %5
    sub           %2, %5
%endmacro

%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro

%macro ABS1_SUM 3
    ABS1          %1, %2
    paddusw       %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2          %1, %2, %3, %4
    paddusw       %5, %1
    paddusw       %6, %2
%endmacro

%macro ABS_SUM_8x8_64 1
    ABS2          m0, m1, m8, m9
    ABS2_SUM      m2, m3, m8, m9, m0, m1
    ABS2_SUM      m4, m5, m8, m9, m0, m1
    ABS2_SUM      m6, m7, m8, m9, m0, m1
    paddusw       m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova        [%1], m7
    ABS1          m0, m7
    ABS1          m1, m7
    ABS1_SUM      m2, m7, m0
    ABS1_SUM      m3, m7, m1
    ABS1_SUM      m4, m7, m0
    ABS1_SUM      m5, m7, m1
    ABS1_SUM      m6, m7, m0
    mova          m2, [%1]
    ABS1_SUM      m2, m7, m1
    paddusw       m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
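
; Rough C model of what the HSUM macro below computes (illustrative sketch,
; not an FFmpeg API): a saturating horizontal add of the 16-bit lanes of %1,
; leaving the scalar result in %3.  The macro folds lanes pairwise, so the
; exact points where saturation kicks in differ slightly, but the net effect
; is the same.
;
;   /* assumes <stdint.h> */
;   static unsigned hsum_ref(const uint16_t *lane, int nlanes /* 4 or 8 */)
;   {
;       unsigned sum = 0;
;       for (int i = 0; i < nlanes; i++) {
;           sum += lane[i];
;           if (sum > 0xFFFF)      /* paddusw saturates at 0xFFFF */
;               sum = 0xFFFF;
;       }
;       return sum;
;   }
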
%macro HSUM 3
%if cpuflag(sse2)
    movhlps       %2, %1
    paddusw       %1, %2
    pshuflw       %2, %1, 0xE
    paddusw       %1, %2
    pshuflw       %2, %1, 0x1
    paddusw       %1, %2
    movd          %3, %1
%elif cpuflag(mmxext)
    pshufw        %2, %1, 0xE
    paddusw       %1, %2
    pshufw        %2, %1, 0x1
    paddusw       %1, %2
    movd          %3, %1
%elif cpuflag(mmx)
    mova          %2, %1
    psrlq         %1, 32
    paddusw       %1, %2
    mova          %2, %1
    psrlq         %1, 16
    paddusw       %1, %2
    movd          %3, %1
%endif
%endmacro

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova          %2, [%1+mmsize*0]
    mova          %3, [%1+mmsize*1]
    mova          %4, [%1+mmsize*2]
    mova          %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB          rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD          rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB          rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov          r5d, eax

    add           r1, 8
    add           r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add          r5d, eax

    cmp          r4d, 16
    jne .done

    lea           r1, [r1+r3*8-8]
    lea           r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add          r5d, eax

    add           r1, 8
    add           r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add          r5d, eax

.done:
    mov          eax, r5d
%ifndef m8
    ADD          rsp, pad
%endif
    RET
%endmacro

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea           r0, [r3*3]
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W  0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8   rsp+gprsize
    HSUM          m0, m1, eax
    and          eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,
;                               uint8_t *src2, ptrdiff_t stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling func)
hadamard8x8_diff %+ SUFFIX:
    lea           r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova        [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W  0, 1, 2, 3, 7
    STORE4        rsp+gprsize, m0, m1, m2, m3
    mova          m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W  4, 5, 6, 7, 0
    STORE4        rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova        [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W  0, 1, 2, 3, 7
    STORE4        rsp+gprsize+0x20, m0, m1, m2, m3
    mova          m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W  4, 5, 6, 7, 0

    LOAD4         rsp+gprsize     , m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize+0x60
    mova        [rsp+gprsize+0x60], m0

    LOAD4         rsp+gprsize     , m0, m1, m2, m3
    LOAD4         rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32 rsp+gprsize
    paddusw       m0, [rsp+gprsize+0x60]

    HSUM          m0, m1, eax
    and          rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
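
; Rough C reference sketch of the 8x8 Hadamard difference (SATD) computed
; above (illustrative only; FFmpeg's scalar version lives in me_cmp.c, and
; the helper name wht8_1d here is made up):
;
;   /* assumes <stdint.h> and <stddef.h> */
;   static void wht8_1d(int *b, int stride)
;   {
;       for (int step = 1; step < 8; step *= 2)          /* butterfly stages */
;           for (int i = 0; i < 8; i += 2 * step)
;               for (int j = i; j < i + step; j++) {
;                   int a = b[ j         * stride];
;                   int c = b[(j + step) * stride];
;                   b[ j         * stride] = a + c;
;                   b[(j + step) * stride] = a - c;
;               }
;   }
;
;   static int hadamard8_diff_ref(const uint8_t *src1, const uint8_t *src2,
;                                 ptrdiff_t stride)
;   {
;       int d[8][8], sum = 0;
;       for (int y = 0; y < 8; y++)                      /* DIFF_PIXELS_8 */
;           for (int x = 0; x < 8; x++)
;               d[y][x] = src1[y * stride + x] - src2[y * stride + x];
;       for (int i = 0; i < 8; i++) wht8_1d(d[i],      1);  /* rows    */
;       for (int i = 0; i < 8; i++) wht8_1d(&d[0][i], 8);   /* columns */
;       for (int y = 0; y < 8; y++)                      /* ABS_SUM_8x8 */
;           for (int x = 0; x < 8; x++)
;               sum += d[y][x] < 0 ? -d[y][x] : d[y][x];
;       return sum;   /* the SIMD code additionally saturates at 0xFFFF,
;                        see the FIXME above HSUM */
;   }
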
; int ff_sse*_*(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;               ptrdiff_t line_size, int h)

%macro SUM_SQUARED_ERRORS 1
cglobal sse%1, 5,5,8, v, pix1, pix2, lsize, h
%if %1 == mmsize
    shr           hd, 1
%endif
    pxor          m0, m0            ; mm0 = 0
    pxor          m7, m7            ; mm7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu          m1, [pix1q]       ; m1 = pix1[0][0-15], [0-7] for mmx
    movu          m2, [pix2q]       ; m2 = pix2[0][0-15], [0-7] for mmx
%if %1 == mmsize
    movu          m3, [pix1q+lsizeq] ; m3 = pix1[1][0-15], [0-7] for mmx
    movu          m4, [pix2q+lsizeq] ; m4 = pix2[1][0-15], [0-7] for mmx
%else ; %1 / 2 == mmsize; mmx only
    mova          m3, [pix1q+8]     ; m3 = pix1[0][8-15]
    mova          m4, [pix2q+8]     ; m4 = pix2[0][8-15]
%endif

    ; todo: mm1-mm2, mm3-mm4
    ; algo: subtract mm1 from mm2 with saturation and vice versa
    ;       OR the result to get the absolute difference
    mova          m5, m1
    mova          m6, m3
    psubusb       m1, m2
    psubusb       m3, m4
    psubusb       m2, m5
    psubusb       m4, m6

    por           m2, m1
    por           m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova          m1, m2
    mova          m3, m4

    punpckhbw     m2, m0
    punpckhbw     m4, m0
    punpcklbw     m1, m0            ; mm1 now spread over (mm1, mm2)
    punpcklbw     m3, m0            ; mm4 now spread over (mm3, mm4)

    pmaddwd       m2, m2
    pmaddwd       m4, m4
    pmaddwd       m1, m1
    pmaddwd       m3, m3

    paddd         m1, m2
    paddd         m3, m4
    paddd         m7, m1
    paddd         m7, m3

%if %1 == mmsize
    lea        pix1q, [pix1q + 2*lsizeq]
    lea        pix2q, [pix2q + 2*lsizeq]
%else
    add        pix1q, lsizeq
    add        pix2q, lsizeq
%endif
    dec           hd
    jnz .next2lines

    HADDD         m7, m1
    movd         eax, m7            ; return value
    RET
%endmacro

INIT_MMX mmx
SUM_SQUARED_ERRORS 8

INIT_MMX mmx
SUM_SQUARED_ERRORS 16

INIT_XMM sse2
SUM_SQUARED_ERRORS 16
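
; Rough C sketch of the sum of squared errors computed above (illustrative,
; not the scalar code from me_cmp.c; width is 8 or 16 to match sse8/sse16):
;
;   /* assumes <stdint.h> and <stddef.h> */
;   static int sse_ref(const uint8_t *pix1, const uint8_t *pix2,
;                      ptrdiff_t line_size, int h, int width)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < width; x++) {
;               int d = pix1[x] - pix2[x]; /* asm gets |d| via psubusb + por */
;               sum  += d * d;             /* asm squares/pairs with pmaddwd */
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }
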
;-----------------------------------------------
;int ff_sum_abs_dctelem(int16_t *block)
;-----------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline loops

%macro SUM_ABS_DCTELEM 2
cglobal sum_abs_dctelem, 1, 1, %1, block
    pxor          m0, m0
    pxor          m1, m1
%assign %%i 0
%rep %2
    mova          m2, [blockq+mmsize*(0+%%i)]
    mova          m3, [blockq+mmsize*(1+%%i)]
    mova          m4, [blockq+mmsize*(2+%%i)]
    mova          m5, [blockq+mmsize*(3+%%i)]
    ABS1_SUM      m2, m6, m0
    ABS1_SUM      m3, m6, m1
    ABS1_SUM      m4, m6, m0
    ABS1_SUM      m5, m6, m1
%assign %%i %%i+4
%endrep
    paddusw       m0, m1
    HSUM          m0, m1, eax
    and          eax, 0xFFFF
    RET
%endmacro

INIT_MMX mmx
SUM_ABS_DCTELEM 0, 4
INIT_MMX mmxext
SUM_ABS_DCTELEM 0, 4
INIT_XMM sse2
SUM_ABS_DCTELEM 7, 2
INIT_XMM ssse3
SUM_ABS_DCTELEM 6, 2

;------------------------------------------------------------------------------
; int ff_hf_noise*_mmx(uint8_t *pix1, ptrdiff_t lsize, int h)
;------------------------------------------------------------------------------
; %1 = 8/16. %2-5 = m#
%macro HF_NOISE_PART1 5
    mova         m%2, [pix1q]
%if %1 == 8
    mova         m%3, m%2
    psllq        m%2, 8
    psrlq        m%3, 8
    psrlq        m%2, 8
%else
    mova         m%3, [pix1q+1]
%endif
    mova         m%4, m%2
    mova         m%5, m%3
    punpcklbw    m%2, m7
    punpcklbw    m%3, m7
    punpckhbw    m%4, m7
    punpckhbw    m%5, m7
    psubw        m%2, m%3
    psubw        m%4, m%5
%endmacro

; %1-4 = m#
%macro HF_NOISE_PART2 4
    psubw        m%1, m%3
    psubw        m%2, m%4
    pxor          m3, m3
    pxor          m1, m1
    pcmpgtw       m3, m%1
    pcmpgtw       m1, m%2
    pxor         m%1, m3
    pxor         m%2, m1
    psubw        m%1, m3
    psubw        m%2, m1
    paddw        m%2, m%1
    paddw         m6, m%2
%endmacro

; %1 = 8/16
%macro HF_NOISE 1
cglobal hf_noise%1, 3,3,0, pix1, lsize, h
    sub           hd, 2
    pxor          m7, m7
    pxor          m6, m6
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    add        pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add        pix1q, lsizeq
.loop:
    HF_NOISE_PART1 %1, 0, 1, 2, 3
    HF_NOISE_PART2     4, 5, 0, 2
    add        pix1q, lsizeq
    HF_NOISE_PART1 %1, 4, 1, 5, 3
    HF_NOISE_PART2     0, 2, 4, 5
    add        pix1q, lsizeq
    sub           hd, 2
    jne .loop

    mova          m0, m6
    punpcklwd     m0, m7
    punpckhwd     m6, m7
    paddd         m6, m0
    mova          m0, m6
    psrlq         m6, 32
    paddd         m0, m6
    movd         eax, m0            ; eax = result of hf_noise8;
    REP_RET                         ; return eax;
%endmacro

INIT_MMX mmx
HF_NOISE 8
HF_NOISE 16

;---------------------------------------------------------------------------------------
;int ff_sad_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;---------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD 1
cglobal sad%1, 5, 5, 3, v, pix1, pix2, stride, h
    movu          m2, [pix2q]
    movu          m1, [pix2q+strideq]
    psadbw        m2, [pix1q]
    psadbw        m1, [pix1q+strideq]
    paddw         m2, m1
%if %1 != mmsize
    movu          m0, [pix2q+8]
    movu          m1, [pix2q+strideq+8]
    psadbw        m0, [pix1q+8]
    psadbw        m1, [pix1q+strideq+8]
    paddw         m2, m0
    paddw         m2, m1
%endif
    sub           hd, 2

align 16
.loop:
    lea        pix1q, [pix1q+strideq*2]
    lea        pix2q, [pix2q+strideq*2]
    movu          m0, [pix2q]
    movu          m1, [pix2q+strideq]
    psadbw        m0, [pix1q]
    psadbw        m1, [pix1q+strideq]
    paddw         m2, m0
    paddw         m2, m1
%if %1 != mmsize
    movu          m0, [pix2q+8]
    movu          m1, [pix2q+strideq+8]
    psadbw        m0, [pix1q+8]
    psadbw        m1, [pix1q+strideq+8]
    paddw         m2, m0
    paddw         m2, m1
%endif
    sub           hd, 2
    jg .loop
%if mmsize == 16
    movhlps       m0, m2
    paddw         m2, m0
%endif
    movd         eax, m2
    RET
%endmacro

INIT_MMX mmxext
SAD 8
SAD 16
INIT_XMM sse2
SAD 16
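
; Rough C sketch of the plain SAD above (illustrative; width is 8 or 16):
;
;   /* assumes <stdint.h> and <stddef.h> */
;   static int sad_ref(const uint8_t *pix1, const uint8_t *pix2,
;                      ptrdiff_t stride, int h, int width)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < width; x++) {
;               int d = pix1[x] - pix2[x];
;               sum  += d < 0 ? -d : d;  /* psadbw does 8/16 of these at once */
;           }
;           pix1 += stride;
;           pix2 += stride;
;       }
;       return sum;
;   }
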
;------------------------------------------------------------------------------------------
;int ff_sad_x2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_X2 1
cglobal sad%1_x2, 5, 5, 5, v, pix1, pix2, stride, h
    movu          m0, [pix2q]
    movu          m2, [pix2q+strideq]
%if mmsize == 16
    movu          m3, [pix2q+1]
    movu          m4, [pix2q+strideq+1]
    pavgb         m0, m3
    pavgb         m2, m4
%else
    pavgb         m0, [pix2q+1]
    pavgb         m2, [pix2q+strideq+1]
%endif
    psadbw        m0, [pix1q]
    psadbw        m2, [pix1q+strideq]
    paddw         m0, m2
%if %1 != mmsize
    movu          m1, [pix2q+8]
    movu          m2, [pix2q+strideq+8]
    pavgb         m1, [pix2q+9]
    pavgb         m2, [pix2q+strideq+9]
    psadbw        m1, [pix1q+8]
    psadbw        m2, [pix1q+strideq+8]
    paddw         m0, m1
    paddw         m0, m2
%endif
    sub           hd, 2

align 16
.loop:
    lea        pix1q, [pix1q+2*strideq]
    lea        pix2q, [pix2q+2*strideq]
    movu          m1, [pix2q]
    movu          m2, [pix2q+strideq]
%if mmsize == 16
    movu          m3, [pix2q+1]
    movu          m4, [pix2q+strideq+1]
    pavgb         m1, m3
    pavgb         m2, m4
%else
    pavgb         m1, [pix2q+1]
    pavgb         m2, [pix2q+strideq+1]
%endif
    psadbw        m1, [pix1q]
    psadbw        m2, [pix1q+strideq]
    paddw         m0, m1
    paddw         m0, m2
%if %1 != mmsize
    movu          m1, [pix2q+8]
    movu          m2, [pix2q+strideq+8]
    pavgb         m1, [pix2q+9]
    pavgb         m2, [pix2q+strideq+9]
    psadbw        m1, [pix1q+8]
    psadbw        m2, [pix1q+strideq+8]
    paddw         m0, m1
    paddw         m0, m2
%endif
    sub           hd, 2
    jg .loop
%if mmsize == 16
    movhlps       m1, m0
    paddw         m0, m1
%endif
    movd         eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_X2 8
SAD_X2 16
INIT_XMM sse2
SAD_X2 16

;------------------------------------------------------------------------------------------
;int ff_sad_y2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_Y2 1
cglobal sad%1_y2, 5, 5, 4, v, pix1, pix2, stride, h
    movu          m1, [pix2q]
    movu          m0, [pix2q+strideq]
    movu          m3, [pix2q+2*strideq]
    pavgb         m1, m0
    pavgb         m0, m3
    psadbw        m1, [pix1q]
    psadbw        m0, [pix1q+strideq]
    paddw         m0, m1
    mova          m1, m3
%if %1 != mmsize
    movu          m4, [pix2q+8]
    movu          m5, [pix2q+strideq+8]
    movu          m6, [pix2q+2*strideq+8]
    pavgb         m4, m5
    pavgb         m5, m6
    psadbw        m4, [pix1q+8]
    psadbw        m5, [pix1q+strideq+8]
    paddw         m0, m4
    paddw         m0, m5
    mova          m4, m6
%endif
    add        pix2q, strideq
    sub           hd, 2

align 16
.loop:
    lea        pix1q, [pix1q+2*strideq]
    lea        pix2q, [pix2q+2*strideq]
    movu          m2, [pix2q]
    movu          m3, [pix2q+strideq]
    pavgb         m1, m2
    pavgb         m2, m3
    psadbw        m1, [pix1q]
    psadbw        m2, [pix1q+strideq]
    paddw         m0, m1
    paddw         m0, m2
    mova          m1, m3
%if %1 != mmsize
    movu          m5, [pix2q+8]
    movu          m6, [pix2q+strideq+8]
    pavgb         m4, m5
    pavgb         m5, m6
    psadbw        m4, [pix1q+8]
    psadbw        m5, [pix1q+strideq+8]
    paddw         m0, m4
    paddw         m0, m5
    mova          m4, m6
%endif
    sub           hd, 2
    jg .loop
%if mmsize == 16
    movhlps       m1, m0
    paddw         m0, m1
%endif
    movd         eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_Y2 8
SAD_Y2 16
INIT_XMM sse2
SAD_Y2 16
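
; Rough C sketch of the half-pel variants above (illustrative): sad*_x2
; compares pix1 against pix2 averaged with its right neighbour, sad*_y2
; against pix2 averaged with the next line; pavgb rounds up, (a+b+1)>>1.
;
;   /* assumes <stdint.h> and <stddef.h> */
;   static int sad_x2_ref(const uint8_t *pix1, const uint8_t *pix2,
;                         ptrdiff_t stride, int h, int width)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < width; x++) {
;               /* for sad*_y2, use pix2[x + stride] instead of pix2[x + 1] */
;               int avg = (pix2[x] + pix2[x + 1] + 1) >> 1;
;               int d   = pix1[x] - avg;
;               sum    += d < 0 ? -d : d;
;           }
;           pix1 += stride;
;           pix2 += stride;
;       }
;       return sum;
;   }
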
;-------------------------------------------------------------------------------------------
;int ff_sad_approx_xy2_<opt>(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h);
;-------------------------------------------------------------------------------------------
;%1 = 8/16
%macro SAD_APPROX_XY2 1
cglobal sad%1_approx_xy2, 5, 5, 7, v, pix1, pix2, stride, h
    mova          m4, [pb_1]
    movu          m1, [pix2q]
    movu          m0, [pix2q+strideq]
    movu          m3, [pix2q+2*strideq]
%if mmsize == 16
    movu          m5, [pix2q+1]
    movu          m6, [pix2q+strideq+1]
    movu          m2, [pix2q+2*strideq+1]
    pavgb         m1, m5
    pavgb         m0, m6
    pavgb         m3, m2
%else
    pavgb         m1, [pix2q+1]
    pavgb         m0, [pix2q+strideq+1]
    pavgb         m3, [pix2q+2*strideq+1]
%endif
    psubusb       m0, m4
    pavgb         m1, m0
    pavgb         m0, m3
    psadbw        m1, [pix1q]
    psadbw        m0, [pix1q+strideq]
    paddw         m0, m1
    mova          m1, m3
%if %1 != mmsize
    movu          m5, [pix2q+8]
    movu          m6, [pix2q+strideq+8]
    movu          m7, [pix2q+2*strideq+8]
    pavgb         m5, [pix2q+1+8]
    pavgb         m6, [pix2q+strideq+1+8]
    pavgb         m7, [pix2q+2*strideq+1+8]
    psubusb       m6, m4
    pavgb         m5, m6
    pavgb         m6, m7
    psadbw        m5, [pix1q+8]
    psadbw        m6, [pix1q+strideq+8]
    paddw         m0, m5
    paddw         m0, m6
    mova          m5, m7
%endif
    add        pix2q, strideq
    sub           hd, 2

align 16
.loop:
    lea        pix1q, [pix1q+2*strideq]
    lea        pix2q, [pix2q+2*strideq]
    movu          m2, [pix2q]
    movu          m3, [pix2q+strideq]
%if mmsize == 16
    movu          m5, [pix2q+1]
    movu          m6, [pix2q+strideq+1]
    pavgb         m2, m5
    pavgb         m3, m6
%else
    pavgb         m2, [pix2q+1]
    pavgb         m3, [pix2q+strideq+1]
%endif
    psubusb       m2, m4
    pavgb         m1, m2
    pavgb         m2, m3
    psadbw        m1, [pix1q]
    psadbw        m2, [pix1q+strideq]
    paddw         m0, m1
    paddw         m0, m2
    mova          m1, m3
%if %1 != mmsize
    movu          m6, [pix2q+8]
    movu          m7, [pix2q+strideq+8]
    pavgb         m6, [pix2q+8+1]
    pavgb         m7, [pix2q+strideq+8+1]
    psubusb       m6, m4
    pavgb         m5, m6
    pavgb         m6, m7
    psadbw        m5, [pix1q+8]
    psadbw        m6, [pix1q+strideq+8]
    paddw         m0, m5
    paddw         m0, m6
    mova          m5, m7
%endif
    sub           hd, 2
    jg .loop
%if mmsize == 16
    movhlps       m1, m0
    paddw         m0, m1
%endif
    movd         eax, m0
    RET
%endmacro

INIT_MMX mmxext
SAD_APPROX_XY2 8
SAD_APPROX_XY2 16
INIT_XMM sse2
SAD_APPROX_XY2 16

;--------------------------------------------------------------------
;int ff_vsad_intra(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                  ptrdiff_t line_size, int h);
;--------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_INTRA 1
cglobal vsad_intra%1, 5, 5, 3, v, pix1, pix2, lsize, h
    mova          m0, [pix1q]
%if %1 == mmsize
    mova          m2, [pix1q+lsizeq]
    psadbw        m0, m2
%else
    mova          m2, [pix1q+lsizeq]
    mova          m3, [pix1q+8]
    mova          m4, [pix1q+lsizeq+8]
    psadbw        m0, m2
    psadbw        m3, m4
    paddw         m0, m3
%endif
    sub           hd, 2

.loop:
    lea        pix1q, [pix1q + 2*lsizeq]
%if %1 == mmsize
    mova          m1, [pix1q]
    psadbw        m2, m1
    paddw         m0, m2
    mova          m2, [pix1q+lsizeq]
    psadbw        m1, m2
    paddw         m0, m1
%else
    mova          m1, [pix1q]
    mova          m3, [pix1q+8]
    psadbw        m2, m1
    psadbw        m4, m3
    paddw         m0, m2
    paddw         m0, m4
    mova          m2, [pix1q+lsizeq]
    mova          m4, [pix1q+lsizeq+8]
    psadbw        m1, m2
    psadbw        m3, m4
    paddw         m0, m1
    paddw         m0, m3
%endif
    sub           hd, 2
    jg .loop

%if mmsize == 16
    pshufd        m1, m0, 0xe
    paddd         m0, m1
%endif
    movd         eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_INTRA 8
VSAD_INTRA 16
INIT_XMM sse2
VSAD_INTRA 16
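
; Rough C sketch of vsad_intra above (illustrative): the SAD between each
; line of pix1 and the line below it, i.e. a vertical-gradient measure of a
; single block (pix2 is unused; width is 8 or 16):
;
;   /* assumes <stdint.h> and <stddef.h> */
;   static int vsad_intra_ref(const uint8_t *pix1, ptrdiff_t line_size,
;                             int h, int width)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < width; x++) {
;               int d = pix1[x] - pix1[x + line_size];
;               sum  += d < 0 ? -d : d;
;           }
;           pix1 += line_size;
;       }
;       return sum;
;   }
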
;---------------------------------------------------------------------
;int ff_vsad_approx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
;                   ptrdiff_t line_size, int h);
;---------------------------------------------------------------------
; %1 = 8/16
%macro VSAD_APPROX 1
cglobal vsad%1_approx, 5, 5, 5, v, pix1, pix2, lsize, h
    mova          m1, [pb_80]
    mova          m0, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
    mova          m4, [pix1q+lsizeq]
%if mmsize == 16
    movu          m3, [pix2q]
    movu          m2, [pix2q+lsizeq]
    psubb         m0, m3
    psubb         m4, m2
%else
    psubb         m0, [pix2q]
    psubb         m4, [pix2q+lsizeq]
%endif
    pxor          m0, m1
    pxor          m4, m1
    psadbw        m0, m4
%else ; vsad16_mmxext
    mova          m3, [pix1q+8]
    psubb         m0, [pix2q]
    psubb         m3, [pix2q+8]
    pxor          m0, m1
    pxor          m3, m1
    mova          m4, [pix1q+lsizeq]
    mova          m5, [pix1q+lsizeq+8]
    psubb         m4, [pix2q+lsizeq]
    psubb         m5, [pix2q+lsizeq+8]
    pxor          m4, m1
    pxor          m5, m1
    psadbw        m0, m4
    psadbw        m3, m5
    paddw         m0, m3
%endif
    sub           hd, 2

.loop:
    lea        pix1q, [pix1q + 2*lsizeq]
    lea        pix2q, [pix2q + 2*lsizeq]
    mova          m2, [pix1q]
%if %1 == mmsize ; vsad8_mmxext, vsad16_sse2
%if mmsize == 16
    movu          m3, [pix2q]
    psubb         m2, m3
%else
    psubb         m2, [pix2q]
%endif
    pxor          m2, m1
    psadbw        m4, m2
    paddw         m0, m4
    mova          m4, [pix1q+lsizeq]
    movu          m3, [pix2q+lsizeq]
    psubb         m4, m3
    pxor          m4, m1
    psadbw        m2, m4
    paddw         m0, m2
%else ; vsad16_mmxext
    mova          m3, [pix1q+8]
    psubb         m2, [pix2q]
    psubb         m3, [pix2q+8]
    pxor          m2, m1
    pxor          m3, m1
    psadbw        m4, m2
    psadbw        m5, m3
    paddw         m0, m4
    paddw         m0, m5
    mova          m4, [pix1q+lsizeq]
    mova          m5, [pix1q+lsizeq+8]
    psubb         m4, [pix2q+lsizeq]
    psubb         m5, [pix2q+lsizeq+8]
    pxor          m4, m1
    pxor          m5, m1
    psadbw        m2, m4
    psadbw        m3, m5
    paddw         m0, m2
    paddw         m0, m3
%endif
    sub           hd, 2
    jg .loop

%if mmsize == 16
    pshufd        m1, m0, 0xe
    paddd         m0, m1
%endif
    movd         eax, m0
    RET
%endmacro

INIT_MMX mmxext
VSAD_APPROX 8
VSAD_APPROX 16
INIT_XMM sse2
VSAD_APPROX 16
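
; Rough C sketch of what vsad*_approx approximates (illustrative): the exact
; quantity is the vertical SAD of the residual pix1 - pix2.  The SIMD code
; computes the per-pixel differences in wrapping 8-bit arithmetic biased by
; 0x80 (the pxor with pb_80), so results can deviate when a difference
; overflows a signed byte -- hence "approx".  Width is 8 or 16.
;
;   /* assumes <stdint.h> and <stddef.h> */
;   static int vsad_ref(const uint8_t *pix1, const uint8_t *pix2,
;                       ptrdiff_t line_size, int h, int width)
;   {
;       int sum = 0;
;       for (int y = 0; y < h - 1; y++) {
;           for (int x = 0; x < width; x++) {
;               int d = (pix1[x]             - pix2[x])
;                     - (pix1[x + line_size] - pix2[x + line_size]);
;               sum += d < 0 ? -d : d;
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }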