;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Right shift applied after each bilinear filter pass (taps sum to 128).
%define xmm_filter_shift            7

;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Sum of squares of 256 signed 16-bit values starting at src_ptr.
; The loop reads 8 x 64 bytes with movdqa, so src_ptr must be 16-byte
; aligned.  The total is returned in the low 32 bits of rax; the bits
; above that hold a partial sum and are not meaningful.
global sym(vp8_get_mb_ss_sse2) PRIVATE
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 iterations of 32 shorts each
    pxor        xmm4, xmm4              ; xmm4 = four dword partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]

    ; pmaddwd x, x squares each word and adds adjacent pairs into dwords
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    ; dec leaves CF untouched, so the loop must branch on ZF alone
    ; (the previous "ja" also tested a stale CF left by the add above).
    dec         rcx
    jnz         .NEXTROW

    ; fold the four dwords of xmm4 into dword 0
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16x16 block of bytes, stores the sum of (src - ref) in *Sum and
; the sum of (src - ref)^2 in *SSE.  rax is left holding the Sum pointer;
; no separate return value is computed.
;
; Register roles: rsi/rdi = current src/ref row, rax/rdx = strides,
; rcx = rows left, rbx = prefetch scratch, xmm0 = zero,
; xmm7 = 8 word sums of differences, xmm6 = 4 dword sums of squares.
global sym(vp8_get16x16var_sse2) PRIVATE
sym(vp8_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch rows 0-7 of the source block
    lea         rcx, [rax+rax*2]        ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; Prefetch rows 0-7 of the reference block
    lea         rcx, [rdx+rdx*2]        ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; zero, for byte -> word unpack
    pxor        xmm7, xmm7              ; accumulated differences
    pxor        xmm6, xmm6              ; accumulated squared differences
    mov         rcx, 16                 ; rows

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]             ; row 8 ahead of the current one
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0              ; src pixels 0-7 as words
    punpckhbw   xmm3, xmm0              ; src pixels 8-15 as words
    punpcklbw   xmm2, xmm0              ; ref pixels 0-7 as words
    punpckhbw   xmm4, xmm0              ; ref pixels 8-15 as words

    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1
    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop

    ; Sum: sign-extend the 8 word lanes of xmm7 to dwords and fold them.
    ; Interleaving below a zero word then shifting right arithmetically
    ; by 16 performs the sign extension.
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm7
    punpckhwd   xmm2, xmm7
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = Sum

    ; SSE: fold the 4 dword lanes of xmm6
    movdqa      xmm2, xmm6
    psrldq      xmm2, 8
    paddd       xmm6, xmm2
    movdqa      xmm2, xmm6
    psrldq      xmm2, 4
    paddd       xmm6, xmm2              ; dword 0 = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm1
    movd        DWORD PTR [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8x8 block of bytes, stores the sum of (src - ref) in *Sum and
; the sum of (src - ref)^2 in *SSE.  rax is left holding the Sum pointer;
; no separate return value is computed.
;
; Register roles: rsi/rdi = current src/ref row, rax/rdx = strides,
; rcx = rows left, xmm0 = zero, xmm7 = 8 word sums of differences,
; xmm1 = 4 dword sums of squares.
global sym(vp8_get8x8var_sse2) PRIVATE
sym(vp8_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; zero, for byte -> word unpack
    pxor        xmm1, xmm1              ; accumulated squared differences
    pxor        xmm7, xmm7              ; accumulated differences
    mov         rcx, 8                  ; rows

.var8loop:
    movq        xmm2, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rdi]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    ; operands are in 0..255, so the saturating subtract never saturates
    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var8loop

    ; Sum: sign-extend the 8 word lanes of xmm7 to dwords and fold them.
    ; |Sum| <= 64 * 255, so this equals the 16-bit total.
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    punpcklwd   xmm2, xmm7
    punpckhwd   xmm3, xmm7
    psrad       xmm2, 16
    psrad       xmm3, 16
    paddd       xmm2, xmm3
    movdqa      xmm3, xmm2
    psrldq      xmm3, 8
    paddd       xmm2, xmm3
    movdqa      xmm3, xmm2
    psrldq      xmm3, 4
    paddd       xmm2, xmm3              ; dword 0 = Sum

    ; SSE: fold the 4 dword lanes of xmm1
    movdqa      xmm3, xmm1
    psrldq      xmm3, 8
    paddd       xmm1, xmm3
    movdqa      xmm3, xmm1
    psrldq      xmm3, 4
    paddd       xmm1, xmm3              ; dword 0 = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm2
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Bilinear-filters an 8-pixel-wide column of ref_ptr (horizontally by
; xoffset, vertically by yoffset), subtracts the matching src_ptr rows and
; stores the sum of the differences in *sum and the sum of their squares
; in *sumsquared.  Height rows of src are consumed.
;
; xoffset/yoffset select a 32-byte row of vp8_bilinear_filters_sse2 (the
; table defines rows 0..7).  A zero offset skips that filter pass:
;   xoffset != 0, yoffset != 0 : both passes   (reads Height+1 ref rows,
;                                                9 bytes per row)
;   xoffset == 0, yoffset != 0 : vertical only (reads Height+1 ref rows)
;   xoffset != 0, yoffset == 0 : horizontal only (9 bytes per row)
;   xoffset == 0, yoffset == 0 : plain difference
;
; Common registers: xmm0 = zero, xmm4 = rounding constant,
; xmm6 = 8 word sums of differences, xmm7 = 4 dword sums of squares.
; The final reduction uses XMM registers only, so the routine does not
; touch MMX/x87 state.
global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        xmm6, xmm6              ; accumulated differences
    pxor        xmm7, xmm7              ; accumulated squared differences

    lea         rsi, [GLOBAL(xmm_bi_rd)]            ; rounding
    movdqa      xmm4, XMMWORD PTR [rsi]

    lea         rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
    movsxd      rax, dword ptr arg(5)   ; xoffset

    cmp         rax, 0                  ; skip first_pass filter if xoffset=0
    je          filter_block2d_bil_var_sse2_sp_only

    shl         rax, 5                  ; 32 bytes per filter row
    lea         rax, [rax + rcx]        ; HFilter

    movsxd      rdx, dword ptr arg(6)   ; yoffset

    cmp         rdx, 0                  ; skip second_pass filter if yoffset=0
    je          filter_block2d_bil_var_sse2_fp_only

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    ; Horizontally filter ref row 0 up front; the loop then needs only one
    ; new horizontal pass per output row.
    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift
    movdqa      xmm5, xmm1              ; xmm5 = filtered previous row

    movsxd      rbx, dword ptr arg(1)   ;ref_pixels_per_line
    lea         rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

filter_block2d_bil_var_sse2_loop:
    ; horizontal pass on the next ref row
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movdqa      xmm3, xmm5              ; previous filtered row
    movdqa      xmm5, xmm1              ; keep current row for the next pass

    ; vertical pass across the previous and current filtered rows
    pmullw      xmm3, [rdx]
    pmullw      xmm1, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rbx]        ;ref_pixels_per_line
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         filter_block2d_bil_var_sse2_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_sp_only:
    movsxd      rdx, dword ptr arg(6)   ; yoffset

    cmp         rdx, 0                  ; skip all if both xoffset=0 and yoffset=0
    je          filter_block2d_bil_var_sse2_full_pixel

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0              ; xmm1 = previous ref row as words

    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    lea         rsi, [rsi + rax]

filter_block2d_bil_sp_only_loop:
    movq        xmm3, QWORD PTR [rsi]
    punpcklbw   xmm3, xmm0
    movdqa      xmm5, xmm3              ; keep unfiltered current row

    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    movdqa      xmm1, xmm5              ; current row becomes previous row
    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_sp_only_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_full_pixel:
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    pxor        xmm0, xmm0

filter_block2d_bil_full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0

    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm2, xmm0

    psubw       xmm1, xmm2
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_full_pixel_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_fp_only:
    ; rax already points at HFilter; rdx is reused for the ref stride
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rdx, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line

filter_block2d_bil_fp_only_loop:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1
    lea         rsi, [rsi + rdx]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_fp_only_loop
    ; falls through into the shared reduction

filter_block2d_bil_variance:
    ; sum: sign-extend the 8 word lanes of xmm6 to dwords and fold them.
    ; Matches the former 16-bit MMX fold whenever the total fits in a
    ; signed word (always true for Height <= 16).
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6
    punpckhwd   xmm2, xmm6
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = xsum

    ; sumsquared: fold the 4 dword lanes of xmm7
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2              ; dword 0 = xxsum

    mov         rsi, arg(7)             ; sum
    mov         rdi, arg(8)             ; sumsquared

    movd        [rsi], xmm1             ; xsum
    movd        [rdi], xmm7             ; xxsum

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance8x_h_sse2):
    ; 8-wide variance against ref averaged with its right and lower
    ; neighbours (pavgb horizontally, then pavgb vertically).
    ; Reads Height+1 ref rows of 9 bytes and Height src rows of 8 bytes.
    ; Registers: rax/rdx = ref/src stride, rcx = rows left, xmm0 = zero,
    ; xmm5 = horizontal average of the previous ref row,
    ; xmm6 = 8 word sums of differences, xmm7 = 4 dword sums of squares.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
    pavgb       xmm5, xmm3              ; xmm5 = horizontal average, row 0

    lea         rsi, [rsi + rax]        ; next ref row

vp8_half_horiz_vert_variance8x_h_1:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = horizontal average, row i+1

    pavgb       xmm5, xmm1              ; vertical average of rows i and i+1
    punpcklbw   xmm5, xmm0              ; as words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; as words

    psubw       xmm5, xmm3              ; prediction - src
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square and pair-add
    paddd       xmm7, xmm5              ; accumulate squared differences

    movdqa      xmm5, xmm1              ; row i+1 becomes the previous row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance8x_h_1

    ; sum: sign-extend the 8 word lanes of xmm6 to dwords and fold them
    ; (XMM only, so MMX/x87 state is left untouched).  Matches the former
    ; 16-bit fold whenever the total fits in a signed word (always true
    ; for Height <= 16).
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6
    punpckhwd   xmm2, xmm6
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = sum

    ; sumsquared: fold the 4 dword lanes of xmm7
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2              ; dword 0 = sumsquared

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm1
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-wide counterpart of the 8-wide half-horizontal/half-vertical routine:
; reads Height+1 ref rows of 17 bytes and Height src rows of 16 bytes.
global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

    movdqu      xmm5, XMMWORD PTR [rsi]
    movdqu      xmm3, XMMWORD PTR [rsi+1]
    pavgb       xmm5, xmm3              ; xmm5 = horizontal average, row 0

    lea         rsi, [rsi + rax]

vp8_half_horiz_vert_variance16x_h_1:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = horizontal average, row i+1

    pavgb       xmm5, xmm1              ; vertical average of rows i and i+1

    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; pixels 0-7 as words
    punpckhbw   xmm4, xmm0              ; pixels 8-15 as words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0
    psubw       xmm5, xmm3

    movq        xmm3, QWORD PTR [rdi+8] ; xmm3 = d8,d9,d10..d15
    punpcklbw   xmm3, xmm0
    psubw       xmm4, xmm3

    paddw       xmm6, xmm5              ; accumulate differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; square and pair-add
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; accumulate squared differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm1              ; row i+1 becomes the previous row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance16x_h_1

    ; sum: sign-extend the 8 word lanes of xmm6 to dwords and fold them
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6
    punpckhwd   xmm2, xmm6
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = Sum

    ; SSE: fold the 4 dword lanes of xmm7
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2              ; dword 0 = SSE

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm1
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-wide variance against ref averaged with the row below it (pavgb).
; Reads Height+1 ref rows and Height src rows of 8 bytes each.
global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0
vp8_half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]       ; ref row i
    movq        xmm3, QWORD PTR [rsi+rax]   ; ref row i+1

    pavgb       xmm5, xmm3              ; vertical average
    punpcklbw   xmm5, xmm0              ; as words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; as words

    psubw       xmm5, xmm3              ; prediction - src
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square and pair-add
    paddd       xmm7, xmm5              ; accumulate squared differences

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_vert_variance8x_h_1

    ; sum: sign-extend the 8 word lanes of xmm6 to dwords and fold them
    ; (XMM only, so MMX/x87 state is left untouched).  Matches the former
    ; 16-bit fold whenever the total fits in a signed word (always true
    ; for Height <= 16).
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6
    punpckhwd   xmm2, xmm6
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = sum

    ; sumsquared: fold the 4 dword lanes of xmm7
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2              ; dword 0 = sumsquared

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm1
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-wide variance against ref averaged with the row below it (pavgb).
; Reads Height+1 ref rows and Height src rows of 16 bytes each.
global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    movdqu      xmm5, XMMWORD PTR [rsi] ; xmm5 = ref row 0
    lea         rsi, [rsi + rax]
    pxor        xmm0, xmm0

vp8_half_vert_variance16x_h_1:
    movdqu      xmm3, XMMWORD PTR [rsi] ; ref row i+1

    pavgb       xmm5, xmm3              ; vertical average of rows i and i+1
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; pixels 0-7 as words
    punpckhbw   xmm4, xmm0              ; pixels 8-15 as words

    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm2, xmm0
    psubw       xmm5, xmm2
    movq        xmm2, QWORD PTR [rdi+8]
    punpcklbw   xmm2, xmm0
    psubw       xmm4, xmm2

    paddw       xmm6, xmm5              ; accumulate differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; square and pair-add
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; accumulate squared differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm3              ; row i+1 becomes the previous row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_vert_variance16x_h_1

    ; sum: sign-extend the 8 word lanes of xmm6 to dwords and fold them
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6
    punpckhwd   xmm2, xmm6
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = Sum

    ; SSE: fold the 4 dword lanes of xmm7
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2              ; dword 0 = SSE

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm1
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-wide variance against ref averaged with its right neighbour (pavgb).
; Reads Height ref rows of 9 bytes and Height src rows of 8 bytes.
global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0
vp8_half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8

    pavgb       xmm5, xmm3              ; horizontal average
    punpcklbw   xmm5, xmm0              ; as words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; as words

    psubw       xmm5, xmm3              ; prediction - src
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square and pair-add
    paddd       xmm7, xmm5              ; accumulate squared differences

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_variance8x_h_1

    ; sum: sign-extend the 8 word lanes of xmm6 to dwords and fold them
    ; (XMM only, so MMX/x87 state is left untouched).  Matches the former
    ; 16-bit fold whenever the total fits in a signed word (always true
    ; for Height <= 16).
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6
    punpckhwd   xmm2, xmm6
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = sum

    ; sumsquared: fold the 4 dword lanes of xmm7
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2              ; dword 0 = sumsquared

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm1
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-wide variance against ref averaged with its right neighbour (pavgb).
; Reads Height ref rows of 17 bytes and Height src rows of 16 bytes.
global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

vp8_half_horiz_variance16x_h_1:
    movdqu      xmm5, XMMWORD PTR [rsi]     ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1]   ; xmm3 = s1,s2,s3..s16

    pavgb       xmm5, xmm3              ; horizontal average
    movdqa      xmm1, xmm5
    punpcklbw   xmm5, xmm0              ; pixels 0-7 as words
    punpckhbw   xmm1, xmm0              ; pixels 8-15 as words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0
    movq        xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9,d10..d15
    punpcklbw   xmm2, xmm0

    psubw       xmm5, xmm3
    psubw       xmm1, xmm2
    paddw       xmm6, xmm5              ; accumulate differences
    paddw       xmm6, xmm1
    pmaddwd     xmm5, xmm5              ; square and pair-add
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm5              ; accumulate squared differences
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_variance16x_h_1

    ; sum: sign-extend the 8 word lanes of xmm6 to dwords and fold them
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6
    punpckhwd   xmm2, xmm6
    psrad       xmm1, 16
    psrad       xmm2, 16
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; dword 0 = Sum

    ; SSE: fold the 4 dword lanes of xmm7
    movdqa      xmm2, xmm7
    psrldq      xmm2, 8
    paddd       xmm7, xmm2
    movdqa      xmm2, xmm7
    psrldq      xmm2, 4
    paddd       xmm7, xmm2              ; dword 0 = SSE

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm1
    movd        [rdi], xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding term added before the >> xmm_filter_shift (64 = 128 / 2).
;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
    times 8 dw 64
; One 32-byte row per sub-pixel offset 0..7: eight copies of the first tap
; followed by eight copies of the second tap.  Each pair of taps sums to 128.
align 16
vp8_bilinear_filters_sse2:
    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112