;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/Yasm (Intel operand order).
; Arguments are always read through arg(n) after SHADOW_ARGS_TO_STACK; the
; macros sym, arg, SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, GET_GOT, RESTORE_GOT,
; GLOBAL, SAVE_XMM, RESTORE_XMM, PRIVATE and SECTION_RODATA are not defined in
; this file -- presumably they come from the include below and hide the
; per-platform ABI differences (TODO confirm against x86_abi_support.asm).
;
; NOTE(review): none of the MMX routines in this file executes emms; callers
; presumably clear the MMX state themselves -- confirm.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_copy_mem16x16_sse2(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride
;    )
;
; Copies 16 rows of 16 bytes from src to dst.
; Loads use movdqu (src may be unaligned); stores use movdqa, so every
; destination row (dst + n*dst_stride) must be 16-byte aligned.
; Registers: rsi = src cursor, rax = src_stride,
;            rdi = dst cursor, rcx = dst_stride, xmm0-xmm5 = row data.
; Loads and stores are interleaved three rows at a time.
;-----------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src;
    movdqu      xmm0,       [rsi]               ; row 0

    movsxd      rax,        dword ptr arg(1) ;src_stride;
    mov         rdi,        arg(2) ;dst;

    movdqu      xmm1,       [rsi+rax]           ; row 1
    movdqu      xmm2,       [rsi+rax*2]         ; row 2

    movsxd      rcx,        dword ptr arg(3) ;dst_stride
    lea         rsi,        [rsi+rax*2]

    movdqa      [rdi],      xmm0
    add         rsi,        rax                 ; rsi -> src row 3

    movdqa      [rdi+rcx],  xmm1
    movdqa      [rdi+rcx*2],xmm2

    lea         rdi,        [rdi+rcx*2]
    movdqu      xmm3,       [rsi]               ; row 3

    add         rdi,        rcx                 ; rdi -> dst row 3
    movdqu      xmm4,       [rsi+rax]           ; row 4

    movdqu      xmm5,       [rsi+rax*2]         ; row 5
    lea         rsi,        [rsi+rax*2]

    movdqa      [rdi],      xmm3
    add         rsi,        rax                 ; rsi -> src row 6

    movdqa      [rdi+rcx],  xmm4
    movdqa      [rdi+rcx*2],xmm5

    lea         rdi,        [rdi+rcx*2]
    movdqu      xmm0,       [rsi]               ; row 6

    add         rdi,        rcx                 ; rdi -> dst row 6
    movdqu      xmm1,       [rsi+rax]           ; row 7

    movdqu      xmm2,       [rsi+rax*2]         ; row 8
    lea         rsi,        [rsi+rax*2]

    movdqa      [rdi],      xmm0
    add         rsi,        rax                 ; rsi -> src row 9

    movdqa      [rdi+rcx],  xmm1

    movdqa      [rdi+rcx*2],    xmm2
    movdqu      xmm3,       [rsi]               ; row 9

    movdqu      xmm4,       [rsi+rax]           ; row 10
    lea         rdi,        [rdi+rcx*2]

    add         rdi,        rcx                 ; rdi -> dst row 9
    movdqu      xmm5,       [rsi+rax*2]         ; row 11

    lea         rsi,        [rsi+rax*2]
    movdqa      [rdi],      xmm3

    add         rsi,        rax                 ; rsi -> src row 12
    movdqa      [rdi+rcx],  xmm4

    movdqa      [rdi+rcx*2],xmm5
    movdqu      xmm0,       [rsi]               ; row 12

    lea         rdi,        [rdi+rcx*2]
    movdqu      xmm1,       [rsi+rax]           ; row 13

    add         rdi,        rcx                 ; rdi -> dst row 12
    movdqu      xmm2,       [rsi+rax*2]         ; row 14

    lea         rsi,        [rsi+rax*2]         ; rsi -> src row 14
    movdqa      [rdi],      xmm0

    movdqa      [rdi+rcx],  xmm1
    movdqa      [rdi+rcx*2],xmm2

    movdqu      xmm3,       [rsi+rax]           ; row 15
    lea         rdi,        [rdi+rcx*2]         ; rdi -> dst row 14

    movdqa      [rdi+rcx],  xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dc_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 DC prediction from both edges:
;   dc = (sum(above[0..7]) + sum(left[0..7 rows]) + 8) >> 4
; and every byte of the 8 destination rows is set to dc.
; The left column is read one byte per row, left_stride bytes apart.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rdi,        arg(2) ;above;
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        mm0,        mm0
    movq        mm1,        [rdi]
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    psadbw      mm1,        mm0                 ; mm1 = sum of the 8 above bytes

    ; from left
    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax*1]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx

    movzx       edx,        byte [rsi+rdi]
    lea         rsi,        [rsi+rax*4]         ; advance to left row 4
    add         ecx,        edx
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx                 ; ecx = sum of the 8 left bytes

    ; add up
    pextrw      edx,        mm1, 0x0
    lea         edx,        [edx+ecx+8]         ; top + left + rounding
    sar         edx,        4
    movd        mm1,        edx
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    pshufw      mm1,        mm1, 0x0            ; broadcast dc to 4 words
    mov         rdi,        arg(0) ;dst;
    packuswb    mm1,        mm1                 ; -> 8 identical bytes

    ; write out
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride
    lea         rdx,        [rdi+rcx*4]         ; rdx -> dst row 4

    movq [rdi      ],       mm1
    movq [rdi+rcx  ],       mm1
    movq [rdi+rcx*2],       mm1
    movq [rdi+rax  ],       mm1
    movq [rdx      ],       mm1
    movq [rdx+rcx  ],       mm1
    movq [rdx+rcx*2],       mm1
    movq [rdx+rax  ],       mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dctop_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 DC prediction from the top edge only:
;   dc = (sum(above[0..7]) + 4) >> 3
; left and left_stride are ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;arg(3), arg(4) not used

    ; from top
    mov         rsi,        arg(2) ;above;
    pxor        mm0,        mm0
    movq        mm1,        [rsi]
    psadbw      mm1,        mm0                 ; mm1 = sum of the 8 above bytes

    ; add up
    paddw       mm1,        [GLOBAL(dc_4)]      ; rounding
    psraw       mm1,        3
    pshufw      mm1,        mm1, 0x0
    packuswb    mm1,        mm1

    ; write out
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

    movq [rdi      ],       mm1
    movq [rdi+rcx  ],       mm1
    movq [rdi+rcx*2],       mm1
    movq [rdi+rax  ],       mm1
    lea         rdi,        [rdi+rcx*4]
    movq [rdi      ],       mm1
    movq [rdi+rcx  ],       mm1
    movq [rdi+rcx*2],       mm1
    movq [rdi+rax  ],       mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dcleft_mmx2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 DC prediction from the left edge only:
;   dc = (sum(left[0..7 rows]) + 4) >> 3
; above is ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; from left
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]         ; advance to left row 4
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    lea         edx,        [ecx+edx+4]         ; last row + running sum + rounding

    ; add up
    shr         edx,        3
    movd        mm1,        edx
    pshufw      mm1,        mm1, 0x0
    packuswb    mm1,        mm1

    ; write out
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

    movq [rdi      ],       mm1
    movq [rdi+rcx  ],       mm1
    movq [rdi+rcx*2],       mm1
    movq [rdi+rax  ],       mm1
    lea         rdi,        [rdi+rcx*4]
    movq [rdi      ],       mm1
    movq [rdi+rcx  ],       mm1
    movq [rdi+rcx*2],       mm1
    movq [rdi+rax  ],       mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_dc128_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills the 8x8 destination block with the constant 128 (dc_128 table).
; above, left and left_stride are ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
sym(vp8_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    ;arg(2), arg(3), arg(4) not used

    ; write out
    movq        mm1,        [GLOBAL(dc_128)]
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

    movq [rax      ],       mm1
    movq [rax+rdx  ],       mm1
    movq [rax+rdx*2],       mm1
    movq [rax+rcx  ],       mm1
    lea         rax,        [rax+rdx*4]
    movq [rax      ],       mm1
    movq [rax+rdx  ],       mm1
    movq [rax+rdx*2],       mm1
    movq [rax+rcx  ],       mm1

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_tm_<opt>(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Macro parameter %1 selects the variant (instantiated below for sse2 and
; ssse3). 8x8 "TM" prediction:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; Two rows are produced per loop iteration (4 iterations).
; xmm1 holds (above[c] - above[-1]) as 8 words for the whole loop.
; Left pixels are fetched with "mov bl, [..]" / "movd xmm, ebx": only the
; low byte of ebx is meaningful, the broadcast below discards the rest.
; ssse3: pshufb with dc_1024 (words of 0x0400) selects source byte 0 for the
; low byte and source byte 4 (zero after movd) for the high byte of each word.
;-----------------------------------------------------------------------
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
sym(vp8_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; read top row
    mov         edx,        4                   ; loop counter: 4 x 2 rows
    mov         rsi,        arg(2) ;above
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        xmm0,       xmm0
%ifidn %1, ssse3
    movdqa      xmm2,       [GLOBAL(dc_1024)]
%endif
    movq        xmm1,       [rsi]
    punpcklbw   xmm1,       xmm0                ; above[0..7] as words

    ; set up left ptrs and subtract topleft
    movd        xmm3,       [rsi-1]             ; byte 0 = above[-1]
    mov         rsi,        arg(3) ;left;
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    punpcklqdq  xmm3,       xmm3
%else
    pshufb      xmm3,       xmm2
%endif
    psubw       xmm1,       xmm3

    ; set up dest ptrs
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

.vp8_intra_pred_uv_tm_%1_loop:
    mov         bl,         [rsi]
    movd        xmm3,       ebx

    mov         bl,         [rsi+rax]
    movd        xmm5,       ebx
%ifidn %1, sse2
    punpcklbw   xmm3,       xmm0
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm3,       xmm3, 0x0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm3,       xmm3
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm3,       xmm2
    pshufb      xmm5,       xmm2
%endif
    paddw       xmm3,       xmm1
    paddw       xmm5,       xmm1
    packuswb    xmm3,       xmm5                ; low qword = row r, high = row r+1
    movq   [rdi    ],       xmm3
    movhps [rdi+rcx],       xmm3
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_tm_%1_loop

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_ve_mmx(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 8x8 vertical prediction: the 8 bytes at above are copied to each of the
; 8 destination rows. left and left_stride are ignored.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
sym(vp8_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ; end prolog

    ; arg(3), arg(4) not used

    ; read from top
    mov         rax,        arg(2) ;above;

    movq        mm1,        [rax]

    ; write out
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

    movq [rax      ],       mm1
    movq [rax+rdx  ],       mm1
    movq [rax+rdx*2],       mm1
    movq [rax+rcx  ],       mm1
    lea         rax,        [rax+rdx*4]
    movq [rax      ],       mm1
    movq [rax+rdx  ],       mm1
    movq [rax+rdx*2],       mm1
    movq [rax+rcx  ],       mm1

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_uv_ho_<opt>(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Macro parameter %1 selects the variant (instantiated below for mmx2 and
; ssse3). 8x8 horizontal prediction: every byte of destination row r is set
; to the left-column byte of row r. above is ignored.
;   mmx2 : loop, two rows per iteration (edx = 4 iterations).
;   ssse3: unrolled, four rows per group; pshufb with dc_00001111 replicates
;          byte 0 into the low qword and byte 1 into the high qword.
; Left pixels go through bl/ebx; only the low byte of ebx is meaningful.
; In the ssse3 path rbx also briefly holds 3*left_stride before bl is
; overwritten with the loaded pixel.
;-----------------------------------------------------------------------
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
sym(vp8_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
%ifidn %1, ssse3
    GET_GOT     rbx
%endif
    ; end prolog

    ;arg(2) not used

    ; read from left and write out
%ifidn %1, mmx2
    mov         edx,        4                   ; loop counter: 4 x 2 rows
%endif
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
%ifidn %1, ssse3
    lea         rdx,        [rcx*3]             ; rdx = 3 * dst_stride
    movdqa      xmm2,       [GLOBAL(dc_00001111)]
%endif

%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
    mov         bl,         [rsi]
    movd        mm0,        ebx

    mov         bl,         [rsi+rax]
    movd        mm1,        ebx

    punpcklbw   mm0,        mm0                 ; word 0 = pixel in both bytes
    punpcklbw   mm1,        mm1
    pshufw      mm0,        mm0, 0x0            ; broadcast word 0
    pshufw      mm1,        mm1, 0x0
    movq   [rdi    ],       mm0
    movq   [rdi+rcx],       mm1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_ho_%1_loop
%else
    ; rows 0-3
    mov         bl,         [rsi]
    movd        xmm0,       ebx

    mov         bl,         [rsi+rax]
    movd        xmm3,       ebx

    mov         bl,         [rsi+rax*2]
    movd        xmm1,       ebx

    lea         rbx,        [rax*3]
    mov         bl,         [rsi+rbx]
    movd        xmm4,       ebx

    punpcklbw   xmm0,       xmm3                ; byte 0 = row 0, byte 1 = row 1
    punpcklbw   xmm1,       xmm4                ; byte 0 = row 2, byte 1 = row 3
    pshufb      xmm0,       xmm2
    pshufb      xmm1,       xmm2
    movq   [rdi    ],       xmm0
    movhps [rdi+rcx],       xmm0
    movq [rdi+rcx*2],       xmm1
    movhps [rdi+rdx],       xmm1
    lea         rsi,        [rsi+rax*4]
    lea         rdi,        [rdi+rcx*4]

    ; rows 4-7
    mov         bl,         [rsi]
    movd        xmm0,       ebx

    mov         bl,         [rsi+rax]
    movd        xmm3,       ebx

    mov         bl,         [rsi+rax*2]
    movd        xmm1,       ebx

    lea         rbx,        [rax*3]
    mov         bl,         [rsi+rbx]
    movd        xmm4,       ebx

    punpcklbw   xmm0,       xmm3
    punpcklbw   xmm1,       xmm4
    pshufb      xmm0,       xmm2
    pshufb      xmm1,       xmm2
    movq   [rdi    ],       xmm0
    movhps [rdi+rcx],       xmm0
    movq [rdi+rcx*2],       xmm1
    movhps [rdi+rdx],       xmm1
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%endif
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dc_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 DC prediction from both edges:
;   dc = (sum(above[0..15]) + sum(left[0..15 rows]) + 16) >> 5
; above is loaded with movdqa and dst rows are stored with movdqa, so both
; must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
sym(vp8_intra_pred_y_dc_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top
    mov         rdi,        arg(2) ;above
    mov         rsi,        arg(3) ;left
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rdi]
    psadbw      xmm1,       xmm0                ; two partial sums (low/high qword)
    movq        xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2                ; word 0 = sum of the 16 above bytes

    ; from left
    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride

    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]

    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]

    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]

    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx                 ; ecx = sum of the 16 left bytes

    ; add up
    pextrw      edx,        xmm1, 0x0
    lea         edx,        [edx+ecx+16]        ; top + left + rounding
    sar         edx,        5
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1                ; 16 identical bytes

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.label:
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz         .label

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dctop_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 DC prediction from the top edge only:
;   dc = (sum(above[0..15]) + 8) >> 4
; left and left_stride are ignored. above and dst rows are accessed with
; movdqa and must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
sym(vp8_intra_pred_y_dctop_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(3), arg(4) not used

    ; from top
    mov         rcx,        arg(2) ;above;
    pxor        xmm0,       xmm0
    movdqa      xmm1,       [rcx]
    psadbw      xmm1,       xmm0
    movdqa      xmm2,       xmm1
    punpckhqdq  xmm1,       xmm1
    paddw       xmm1,       xmm2                ; word 0 = sum of the 16 above bytes

    ; add up
    paddw       xmm1,       [GLOBAL(dc_8)]      ; rounding
    psraw       xmm1,       4
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    mov         rdx,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.label:
    movdqa [rdx      ],     xmm1
    movdqa [rdx+rcx  ],     xmm1
    movdqa [rdx+rcx*2],     xmm1
    movdqa [rdx+rax  ],     xmm1
    lea         rdx,        [rdx+rcx*4]
    movdqa [rdx      ],     xmm1
    movdqa [rdx+rcx  ],     xmm1
    movdqa [rdx+rcx*2],     xmm1
    movdqa [rdx+rax  ],     xmm1
    lea         rdx,        [rdx+rcx*4]
    dec         rsi
    jnz         .label

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dcleft_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 DC prediction from the left edge only:
;   dc = (sum(left[0..15 rows]) + 8) >> 4
; above is ignored. dst rows are stored with movdqa and must be 16-byte
; aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
sym(vp8_intra_pred_y_dcleft_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ;arg(2) not used

    ; from left
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;

    lea         rdi,        [rax*3]             ; rdi = 3 * left_stride
    movzx       ecx,        byte [rsi]
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    add         ecx,        edx
    lea         rsi,        [rsi+rax*4]
    movzx       edx,        byte [rsi]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rax*2]
    add         ecx,        edx
    movzx       edx,        byte [rsi+rdi]
    lea         edx,        [ecx+edx+8]         ; last row + running sum + rounding

    ; add up
    shr         edx,        4
    movd        xmm1,       edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm1,       xmm1
    packuswb    xmm1,       xmm1

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
    lea         rax,        [rcx*3]             ; rax = 3 * dst_stride

.label:
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    movdqa [rdi      ],     xmm1
    movdqa [rdi+rcx  ],     xmm1
    movdqa [rdi+rcx*2],     xmm1
    movdqa [rdi+rax  ],     xmm1
    lea         rdi,        [rdi+rcx*4]
    dec         rsi
    jnz         .label

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_dc128_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Fills the 16x16 destination block with the constant 128 (dc_128 table).
; above, left and left_stride are ignored. dst rows are stored with movdqa
; and must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
sym(vp8_intra_pred_y_dc128_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ;arg(2), arg(3), arg(4) not used

    ; write out
    mov         rsi,        2                   ; 2 iterations x 8 rows
    movdqa      xmm1,       [GLOBAL(dc_128)]
    mov         rax,        arg(0) ;dst;
    movsxd      rdx,        dword ptr arg(1) ;dst_stride
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

.label:
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz         .label

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_tm_<opt>(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; Macro parameter %1 selects the variant (instantiated below for sse2 and
; ssse3). 16x16 "TM" prediction:
;   dst[r][c] = saturate_u8(left[r] + above[c] - above[-1])
; Two rows are produced per loop iteration (8 iterations).
; xmm1 / xmm2 hold (above[c] - above[-1]) as words for columns 0-7 / 8-15.
; above and dst rows are accessed with movdqa and must be 16-byte aligned.
; xmm6 and xmm7 are used, hence SAVE_XMM 7 / RESTORE_XMM.
; Left pixels go through bl/ebx; only the low byte of ebx is meaningful.
;-----------------------------------------------------------------------
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1) PRIVATE
sym(vp8_intra_pred_y_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    GET_GOT     rbx
    ; end prolog

    ; read top row
    mov         edx,        8                   ; loop counter: 8 x 2 rows
    mov         rsi,        arg(2) ;above
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    pxor        xmm0,       xmm0
%ifidn %1, ssse3
    movdqa      xmm3,       [GLOBAL(dc_1024)]
%endif
    movdqa      xmm1,       [rsi]
    movdqa      xmm2,       xmm1
    punpcklbw   xmm1,       xmm0                ; above[0..7]  as words
    punpckhbw   xmm2,       xmm0                ; above[8..15] as words

    ; set up left ptrs and subtract topleft
    movd        xmm4,       [rsi-1]             ; byte 0 = above[-1]
    mov         rsi,        arg(3) ;left
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    punpcklqdq  xmm4,       xmm4
%else
    pshufb      xmm4,       xmm3
%endif
    psubw       xmm1,       xmm4
    psubw       xmm2,       xmm4

    ; set up dest ptrs
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride
.vp8_intra_pred_y_tm_%1_loop:
    mov         bl,         [rsi]
    movd        xmm4,       ebx

    mov         bl,         [rsi+rax]
    movd        xmm5,       ebx
%ifidn %1, sse2
    punpcklbw   xmm4,       xmm0
    punpcklbw   xmm5,       xmm0
    pshuflw     xmm4,       xmm4, 0x0
    pshuflw     xmm5,       xmm5, 0x0
    punpcklqdq  xmm4,       xmm4
    punpcklqdq  xmm5,       xmm5
%else
    pshufb      xmm4,       xmm3
    pshufb      xmm5,       xmm3
%endif
    movdqa      xmm6,       xmm4
    movdqa      xmm7,       xmm5
    paddw       xmm4,       xmm1                ; row r,   columns 0-7
    paddw       xmm6,       xmm2                ; row r,   columns 8-15
    paddw       xmm5,       xmm1                ; row r+1, columns 0-7
    paddw       xmm7,       xmm2                ; row r+1, columns 8-15
    packuswb    xmm4,       xmm6
    packuswb    xmm5,       xmm7
    movdqa [rdi    ],       xmm4
    movdqa [rdi+rcx],       xmm5
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_tm_%1_loop

    ; begin epilog
    RESTORE_GOT
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_y_tm sse2
vp8_intra_pred_y_tm ssse3

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_ve_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 vertical prediction: the 16 bytes at above are copied to each of
; the 16 destination rows. left and left_stride are ignored. above and dst
; rows are accessed with movdqa and must be 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
sym(vp8_intra_pred_y_ve_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    ; end prolog

    ;arg(3), arg(4) not used

    mov         rax,        arg(2) ;above;
    mov         rsi,        2                   ; 2 iterations x 8 rows
    movsxd      rdx,        dword ptr arg(1) ;dst_stride

    ; read from top
    movdqa      xmm1,       [rax]

    ; write out
    mov         rax,        arg(0) ;dst;
    lea         rcx,        [rdx*3]             ; rcx = 3 * dst_stride

.label:
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    movdqa [rax      ],     xmm1
    movdqa [rax+rdx  ],     xmm1
    movdqa [rax+rdx*2],     xmm1
    movdqa [rax+rcx  ],     xmm1
    lea         rax,        [rax+rdx*4]
    dec         rsi
    jnz         .label

    ; begin epilog
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_intra_pred_y_ho_sse2(
;    unsigned char *dst,
;    int dst_stride,
;    unsigned char *above,
;    unsigned char *left,
;    int left_stride
;    )
;
; 16x16 horizontal prediction: every byte of destination row r is set to
; the left-column byte of row r. above is ignored. Two rows are produced
; per loop iteration (8 iterations). dst rows are stored with movdqa and
; must be 16-byte aligned.
; Left pixels go through bl/ebx; only the low byte of ebx is meaningful.
;-----------------------------------------------------------------------
global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
sym(vp8_intra_pred_y_ho_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ;arg(2) not used

    ; read from left and write out
    mov         edx,        8                   ; loop counter: 8 x 2 rows
    mov         rsi,        arg(3) ;left;
    movsxd      rax,        dword ptr arg(4) ;left_stride;
    mov         rdi,        arg(0) ;dst;
    movsxd      rcx,        dword ptr arg(1) ;dst_stride

.vp8_intra_pred_y_ho_sse2_loop:
    mov         bl,         [rsi]
    movd        xmm0,       ebx
    mov         bl,         [rsi+rax]
    movd        xmm1,       ebx

    ; FIXME use pshufb for ssse3 version
    punpcklbw   xmm0,       xmm0                ; word 0 = pixel in both bytes
    punpcklbw   xmm1,       xmm1
    pshuflw     xmm0,       xmm0, 0x0           ; broadcast word 0 to low qword
    pshuflw     xmm1,       xmm1, 0x0
    punpcklqdq  xmm0,       xmm0                ; ... and to the high qword
    punpcklqdq  xmm1,       xmm1
    movdqa [rdi    ],       xmm0
    movdqa [rdi+rcx],       xmm1
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_ho_sse2_loop

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
dc_128:                     ; 16 bytes of 128 (dc128 fill value)
    times 16 db 128
dc_4:                       ; 4 words of 4 (rounding for the 8-pixel DC, read as a qword)
    times 4 dw 4
align 16
dc_8:                       ; 8 words of 8 (rounding for the 16-pixel DC)
    times 8 dw 8
align 16
dc_1024:                    ; pshufb control: each word = source bytes (0, 4)
    times 8 dw 0x400
align 16
dc_00001111:                ; pshufb control: low qword <- byte 0, high qword <- byte 1
    times 8 db 0
    times 8 db 1