;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Prologue for the five-argument routines that work on one reference
; pointer (the x3 SAD kernels, the plain 16x16 SAD and the 32xN copy).
; It binds the symbolic names used by the function bodies to the
; registers / stack slots of the active ABI:
;
;   name        32-bit           Win64                  other 64-bit
;   src_ptr     rsi              rcx                    rdi
;   src_stride  rax              rdx                    rsi
;   ref_ptr     rdi              r8                     rdx
;   ref_stride  rdx              r9                     rcx
;   end_ptr     rcx (scratch)    r10 (scratch)          r9 (scratch)
;   ret_var     rbx (scratch)    r11 (scratch)          r10 (scratch)
;   5th arg     arg(4)           stack slot             r8 / r8d
;
; result_ptr, max_sad and height are three views of the same fifth
; argument; each function uses the one matching its prototype.
;
; 32-bit: builds an rbp frame and saves rsi, rdi, rbx.
; Win64:  SAVE_XMM 7 is issued before anything else (xmm6/xmm7 are used
;         by the kernels).
; 64-bit: the int strides are sign-extended in place.  Only the low
;         32 bits of an int argument are defined by the calling
;         convention, and the strides are used directly in 64-bit
;         address arithmetic.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx                 ; src_stride: widen int -> 64 bit
    movsxd      r9,         r9d                 ; ref_stride: widen int -> 64 bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    %define     height      r8d                 ; int argument: only the low dword is defined

    movsxd      rsi,        esi                 ; src_stride: widen int -> 64 bit
    movsxd      rcx,        ecx                 ; ref_stride: widen int -> 64 bit
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Epilogue matching STACK_FRAME_CREATE_X3: clears the symbolic names,
; restores whatever the prologue saved for the active ABI and returns.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
;
; Prologue for the x4d routines.  The third argument is the address of
; a table of four reference pointers; they are loaded into
; r0_ptr..r3_ptr with LOAD_X4_ADDRESSES.
;
;   name        32-bit     Win64      other 64-bit
;   src_ptr     rsi        rcx        rdi
;   src_stride  rax        rdx        rsi
;   r0_ptr      rcx        rsi        r9
;   r1_ptr      rdx        r10        r10
;   r2_ptr      rbx        r11        r11
;   r3_ptr      rdi        r8         rdx
;   ref_stride  rbp        r9         rcx
;   result_ptr  arg(4)     stack slot r8
;
; 32-bit: rbp doubles as ref_stride, so the frame pointer is pushed a
;         second time; the function body has to `pop rbp` before it can
;         read result_ptr (which is addressed through arg()).
; Win64:  rsi is saved on the stack in addition to SAVE_XMM 7.
; 64-bit: the int strides are sign-extended in place because only the
;         low 32 bits of an int argument are defined by the calling
;         convention.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    push        rbp                             ; keep the frame pointer; rbp becomes ref_stride
    mov         rdi,        arg(2)              ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi,        arg(0)              ; src_ptr

    movsxd      rbx,        dword ptr arg(1)    ; src_stride
    movsxd      rbp,        dword ptr arg(3)    ; ref_stride

    xchg        rbx,        rax                 ; rax = src_stride, rbx = third reference pointer
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     r0_ptr      rsi
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      r8
    %define     ref_stride  r9
    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
    push        rsi

    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr

    movsxd      rdx,        edx                 ; src_stride: widen int -> 64 bit
    movsxd      r9,         r9d                 ; ref_stride: widen int -> 64 bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     r0_ptr      r9
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      rdx
    %define     ref_stride  rcx
    %define     result_ptr  r8

    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

    movsxd      rsi,        esi                 ; src_stride: widen int -> 64 bit
    movsxd      rcx,        ecx                 ; ref_stride: widen int -> 64 bit
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
;
; Epilogue matching STACK_FRAME_CREATE_X4: clears the symbolic names,
; restores the saved registers for the active ABI and returns.  On
; 32-bit builds the extra rbp pushed by the prologue must already have
; been popped by the function body.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;
; Accumulates the SAD of two 16-byte source rows against the reference
; at byte offsets +0, +1 and +2 into xmm5, xmm6 and xmm7.
;   stage == 0 : first call, the accumulators are initialised
;   stage != 0 : the first row is added to the running accumulators
;   stage 0/1  : src and ref are advanced by two rows
;   otherwise  : the pointers are left untouched (last row pair)
; The source rows are read with movdqa, the reference rows with lddqu.
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;
; 8-byte-wide MMX counterpart of PROCESS_16X2X3: accumulates into mm5,
; mm6 and mm7 (reference offsets +0, +1, +2) and clobbers mm0-mm3.
; The stage argument has the same meaning as in PROCESS_16X2X3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
        movq            mm0,        QWORD PTR [%2]
        movq            mm5,        QWORD PTR [%3]
        movq            mm6,        QWORD PTR [%3+1]
        movq            mm7,        QWORD PTR [%3+2]

        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm0,        QWORD PTR [%2]
        movq            mm1,        QWORD PTR [%3]
        movq            mm2,        QWORD PTR [%3+1]
        movq            mm3,        QWORD PTR [%3+2]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm5,        mm1
        paddw           mm6,        mm2
        paddw           mm7,        mm3
%endif
        movq            mm0,        QWORD PTR [%2+%4]
        movq            mm1,        QWORD PTR [%3+%5]
        movq            mm2,        QWORD PTR [%3+%5+1]
        movq            mm3,        QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm5,        mm1
        paddw           mm6,        mm2
        paddw           mm7,        mm3
%endmacro

;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
;
; Loads four consecutive pointer-sized entries from [base] into
; dst0..dst3.  dst3 may be the same register as base because it is
; written last.
;-----------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
        mov             %2,         [%1+REG_SZ_BYTES*0]
        mov             %3,         [%1+REG_SZ_BYTES*1]

        mov             %4,         [%1+REG_SZ_BYTES*2]
        mov             %5,         [%1+REG_SZ_BYTES*3]
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; Accumulates the SAD of two 16-byte source rows against four
; independent reference blocks into xmm4, xmm5, xmm6 and xmm7.
;   stage == 0 : first call, the accumulators are initialised
;   stage != 0 : the first row is added to the running accumulators
;   stage 0/1  : all five pointers are advanced by two rows
;   otherwise  : the pointers are left untouched (last row pair)
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X4 8
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm4,       XMMWORD PTR [%3]
        lddqu           xmm5,       XMMWORD PTR [%4]
        lddqu           xmm6,       XMMWORD PTR [%5]
        lddqu           xmm7,       XMMWORD PTR [%6]

        psadbw          xmm4,       xmm0
        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%4]
        lddqu           xmm3,       XMMWORD PTR [%5]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6]
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%7]
        lddqu           xmm1,       XMMWORD PTR [%3+%8]
        lddqu           xmm2,       XMMWORD PTR [%4+%8]
        lddqu           xmm3,       XMMWORD PTR [%5+%8]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6+%8]
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

%if %1==0 || %1==1
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]

        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]

        lea             %6,         [%6+%8*2]
%endif
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
;
; 8-byte-wide MMX counterpart of PROCESS_16X2X4: accumulates into
; mm4-mm7 and clobbers mm0-mm3.  The stage argument has the same
; meaning as in PROCESS_16X2X4.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X4 8
%if %1==0
        movq            mm0,        QWORD PTR [%2]
        movq            mm4,        QWORD PTR [%3]
        movq            mm5,        QWORD PTR [%4]
        movq            mm6,        QWORD PTR [%5]
        movq            mm7,        QWORD PTR [%6]

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm0,        QWORD PTR [%2]
        movq            mm1,        QWORD PTR [%3]
        movq            mm2,        QWORD PTR [%4]
        movq            mm3,        QWORD PTR [%5]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6]
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif
        movq            mm0,        QWORD PTR [%2+%7]
        movq            mm1,        QWORD PTR [%3+%8]
        movq            mm2,        QWORD PTR [%4+%8]
        movq            mm3,        QWORD PTR [%5+%8]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6+%8]
        paddw           mm5,        mm2
        paddw           mm6,        mm3

%if %1==0 || %1==1
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]

        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]

        lea             %6,         [%6+%8*2]
%endif
        psadbw          mm1,        mm0
        paddw           mm7,        mm1

%endmacro

;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x16x3_sse3) PRIVATE
; Writes three SADs of the 16x16 source block against the reference at
; byte offsets +0, +1 and +2 to results[0..2].
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 row pairs: first call seeds xmm5-xmm7, last one does not
        ; advance the pointers.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; Each accumulator holds one partial sum per 64-bit half;
        ; fold high onto low and store the low dword.
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; Same as vp8_sad16x16x3_sse3 for a 16-wide, 8-row block (4 row pairs).
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; 8-wide, 16-row variant; uses the MMX registers mm0-mm7 and does not
; execute emms itself.
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        punpckldq       mm5,        mm6         ; mm5 = { sad(+0), sad(+1) }

        movq            [rcx],      mm5
        movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; 8-wide, 8-row variant; uses the MMX registers mm0-mm7 and does not
; execute emms itself.
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        punpckldq       mm5,        mm6         ; mm5 = { sad(+0), sad(+1) }

        movq            [rcx],      mm5
        movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; 4x4 variant.  Two 4-byte rows are interleaved into one 8-byte MMX
; register so a single psadbw covers a row pair.  Uses mm0-mm7 and does
; not execute emms itself.
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; rows 0 and 1
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [ref_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm5,        DWORD PTR [ref_ptr+2]

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw          mm1,        mm0

        punpcklbw       mm4,        mm2
        punpcklbw       mm5,        mm3

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        ; rows 2 and 3
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [ref_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm3
        punpcklbw       mm2,        mm6

        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm7,        DWORD PTR [ref_ptr+2]

        psadbw          mm2,        mm0

        paddw           mm1,        mm2

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm6

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        mov             rcx,        result_ptr

        punpckldq       mm1,        mm3         ; mm1 = { sad(+0), sad(+1) }

        movq            [rcx],      mm1
        movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
; Returns the 16x16 SAD in rax.  max_sad is not consulted by this
; implementation.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

        mov             end_ptr,    4           ; 4 iterations x 4 rows
        pxor            xmm7,        xmm7       ; running sum

.vp8_sad16x16_sse3_loop:
        movdqa          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [ref_ptr]
        movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
        movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        movdqa          xmm4,       XMMWORD PTR [src_ptr]
        movdqu          xmm5,       XMMWORD PTR [ref_ptr]
        movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]

        psadbw          xmm0,       xmm1

        movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]

        psadbw          xmm2,       xmm3
        psadbw          xmm4,       xmm5
        psadbw          xmm6,       xmm1

        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        paddw           xmm7,        xmm0
        paddw           xmm7,        xmm2
        paddw           xmm7,        xmm4
        paddw           xmm7,        xmm6

        sub             end_ptr,     1
        jne             .vp8_sad16x16_sse3_loop

        ; fold the two 64-bit partial sums
        movq            xmm0,       xmm7
        psrldq          xmm7,       8
        paddw           xmm0,       xmm7
        movq            rax,        xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
; Copies `height` rows of 32 bytes.  The destination (addressed through
; the ref_ptr / ref_stride aliases) is written with movdqa, so every
; destination row must be 16-byte aligned; the source is read with
; movdqu.  Heights below 4 (including zero and negative values) are
; handled by the single-row loop and never enter the 4-row loop.
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

        ; The 4-row loop copies four rows before testing the counter,
        ; so it must only be entered when at least four rows remain.
        cmp             height,     4
        jl              .block_copy_sse3_remainder

.block_copy_sse3_loopx4:
        lea             end_ptr,    [src_ptr+src_stride*2]

        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
        movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu          xmm4,       XMMWORD PTR [end_ptr]
        movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
        movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
        movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

        lea             src_ptr,    [src_ptr+src_stride*4]

        lea             end_ptr,    [ref_ptr+ref_stride*2]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa          XMMWORD PTR [end_ptr], xmm4
        movdqa          XMMWORD PTR [end_ptr + 16], xmm5
        movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea             ref_ptr,    [ref_ptr+ref_stride*4]

        sub             height,     4
        cmp             height,     4
        jge             .block_copy_sse3_loopx4

.block_copy_sse3_remainder:
        ; Check whether any rows remain to be copied.
        cmp             height,     0
        jle             .copy_is_done

.block_copy_sse3_loop:
        movdqu          xmm0,       XMMWORD PTR [src_ptr]
        movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
        lea             src_ptr,    [src_ptr+src_stride]

        movdqa          XMMWORD PTR [ref_ptr], xmm0
        movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
        lea             ref_ptr,    [ref_ptr+ref_stride]

        sub             height,     1
        jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3

;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
; Writes the SADs of the 16x16 source block against the four reference
; blocks whose addresses are stored at ref_ptr_base to results[0..3].
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; frame pointer back; result_ptr is arg()-relative
%endif
        mov             rcx,        result_ptr

        movq            xmm0,       xmm4
        psrldq          xmm4,       8

        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0
;-
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4

;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x8x4d_sse3) PRIVATE
; 16-wide, 8-row SAD against four reference blocks; results[0..3]
; receive one sum per reference.
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs: seed, two middle pairs, final pair (no pointer advance)
        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; rbp was ref_stride; restore the frame pointer
%endif
        mov             rcx,        result_ptr

        ; fold the high half of each accumulator onto the low half
        ; and store one dword per reference block
        movq            xmm0,       xmm4
        psrldq          xmm4,       8

        paddw           xmm0,       xmm4
        movd            [rcx],      xmm0
;-
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx+4],    xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+8],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4

;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
; 8-wide, 16-row SAD against four reference blocks, computed in the
; MMX registers mm0-mm7.  No emms is executed here.
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 8 row pairs
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; rbp was ref_stride; restore the frame pointer
%endif
        mov             rcx,        result_ptr

        punpckldq       mm4,        mm5         ; mm4 = { sad0, sad1 }
        punpckldq       mm6,        mm7         ; mm6 = { sad2, sad3 }

        movq            [rcx],      mm4
        movq            [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
; 8-wide, 8-row SAD against four reference blocks, computed in the
; MMX registers mm0-mm7.  No emms is executed here.
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; 4 row pairs
        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop             rbp                     ; rbp was ref_stride; restore the frame pointer
%endif
        mov             rcx,        result_ptr

        punpckldq       mm4,        mm5         ; mm4 = { sad0, sad1 }
        punpckldq       mm6,        mm7         ; mm6 = { sad2, sad3 }

        movq            [rcx],      mm4
        movq            [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
; 4x4 SAD against four reference blocks.  Pairs of 4-byte rows are
; interleaved into one MMX register so that each psadbw covers two
; rows.  Uses mm0-mm7; no emms is executed here.
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [r0_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        movd            mm4,        DWORD PTR [r1_ptr]
        movd            mm5,        DWORD PTR [r2_ptr]

        movd            mm6,        DWORD PTR [r3_ptr]
        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]

        movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
        movd            mm7,        DWORD PTR [r3_ptr+ref_stride]

        psadbw          mm1,        mm0

        punpcklbw       mm4,        mm2
        punpcklbw       mm5,        mm3

        punpcklbw       mm6,        mm7
        psadbw          mm4,        mm0

        psadbw          mm5,        mm0
        psadbw          mm6,        mm0

        ; ---- step every pointer down two rows ----
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             r0_ptr,     [r0_ptr+ref_stride*2]

        lea             r1_ptr,     [r1_ptr+ref_stride*2]
        lea             r2_ptr,     [r2_ptr+ref_stride*2]

        lea             r3_ptr,     [r3_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [r0_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm7,        DWORD PTR [r0_ptr+ref_stride]

        punpcklbw       mm0,        mm3
        punpcklbw       mm2,        mm7

        movd            mm3,        DWORD PTR [r1_ptr]
        movd            mm7,        DWORD PTR [r2_ptr]

        psadbw          mm2,        mm0
%if ABI_IS_32BIT
        ; src_stride (rax) is no longer read from here on: move ref_stride
        ; into it so rbp can go back to being the frame pointer that
        ; result_ptr is addressed through.
        mov             rax,        rbp

        pop             rbp
%define     ref_stride    rax
%endif
        ; rsi is free from this point in every ABI variant (its previous
        ; role is not referenced again below).
        mov             rsi,        result_ptr

        paddw           mm1,        mm2
        movd            [rsi],      mm1

        movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
        movd            mm1,        DWORD PTR [r2_ptr+ref_stride]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm1

        psadbw          mm3,        mm0
        psadbw          mm7,        mm0

        movd            mm2,        DWORD PTR [r3_ptr]
        movd            mm1,        DWORD PTR [r3_ptr+ref_stride]

        paddw           mm3,        mm4
        paddw           mm7,        mm5

        movd            [rsi+4],    mm3
        punpcklbw       mm2,        mm1

        movd            [rsi+8],    mm7
        psadbw          mm2,        mm0

        paddw           mm2,        mm6
        movd            [rsi+12],   mm2


    STACK_FRAME_DESTROY_X4
