;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; 8-tap 1-D filters (vertical and horizontal) for 4, 8 and 16 pixel wide
; blocks, each in a plain and an "avg" flavour (result averaged with the
; bytes already present in the output buffer via pavgb).
;
; Every exported function takes the same six arguments:
;   arg(0) unsigned char *src_ptr
;   arg(1) unsigned int   source pitch
;   arg(2) unsigned char *output_ptr
;   arg(3) unsigned int   output pitch
;   arg(4) unsigned int   output_height
;   arg(5) short         *filter   (eight 16-bit taps, read with movdqa, so
;                                   the pointer must be 16-byte aligned)
;
; Per output pixel:  out = sat_u8((sum(tap[i] * src[i]) + 64) >> 7)
; The partial sums are accumulated with saturating 16-bit adds in the order
; (t0t1 + t6t7) + min(t2t3, t4t5) + max(t2t3, t4t5).
;-----------------------------------------------------------------------------

; Rounding term added before the ">> 7": two packed 16-bit words of 64.
%define ROUND_WORD_PAIR 0x00400040

;-----------------------------------------------------------------------------
; SETUP_FILTER_PAIRS
; Loads the argument pointers and spills the filter constants to the stack
; locals k0k1 / k2k3 / k4k5 / k6k7 / krd (the locals must be %define'd by the
; caller before this macro is used).
;   - the eight 16-bit taps are packed to signed bytes (packsswb)
;   - each kXkY local holds the byte pair (tapX, tapY) replicated 8 times,
;     ready to be the second operand of pmaddubsw
;   - krd holds the word 64 replicated 8 times
; Out:      rsi = src_ptr, rdi = output_ptr
; Clobbers: rcx, rdx, xmm0-xmm5
;-----------------------------------------------------------------------------
%macro SETUP_FILTER_PAIRS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, ROUND_WORD_PAIR

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx                   ;only the low dword is used
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate into the high qword
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;broadcast rounding words
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5
%endm

;-----------------------------------------------------------------------------
; Stack locals.  Both variants align rsp to 16 bytes first so that the
; movdqa spills/reloads of the constants are legal.
;-----------------------------------------------------------------------------

; Five 16-byte slots: one per tap pair plus the rounding constant.
%macro FILTER_LOCALS_PAIRS 0
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]
%endm

; Three 16-byte slots: two combined tap vectors plus the rounding constant
; (layout used by the 4-wide horizontal filter).
%macro FILTER_LOCALS_QUADS 0
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]
%endm

; Releases the given number of 16-byte slots and pops the stack pointer
; value that was pushed during alignment.
; NOTE(review): relies on ALIGN_STACK leaving the pre-alignment rsp on top of
; the stack; that macro lives in x86_abi_support.asm - confirm there.
%macro FILTER_LOCALS_FREE 1
    add         rsp, 16 * %1
    pop         rsp
%endm

;-----------------------------------------------------------------------------
; Vertical filters
;-----------------------------------------------------------------------------

; Common prolog of the vertical functions.  rbx is saved because the vertical
; kernels use it to hold pitch * 6.
%macro VERT_PROLOG 0
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    FILTER_LOCALS_PAIRS
%endm

; Mirror image of VERT_PROLOG (everything except the final ret).
%macro VERT_EPILOG 0
    FILTER_LOCALS_FREE 5
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
%endm

;-----------------------------------------------------------------------------
; VERT_SETUP
; Filter constants plus the loop registers shared by all vertical kernels:
;   rsi = src (row A)          rax = src + pitch (row B)
;   rdx = source pitch         rbx = source pitch * 6
;   rdi = output               r8  = output pitch (64-bit builds only; 32-bit
;   rcx = rows remaining             builds re-read arg(3) every row)
; Precondition: output_height >= 1 (the row loops are bottom-tested).
;-----------------------------------------------------------------------------
%macro VERT_SETUP 0
    SETUP_FILTER_PAIRS

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6
%endm

;-----------------------------------------------------------------------------
; VERT_FILTER_COLS avg, mov_op, offset
; Filters one output row segment vertically.
;   avg    : non-zero to average the result with the existing output bytes
;   mov_op : movd for a 4-pixel segment, movq for an 8-pixel segment
;   offset : byte offset of the segment within the row (source and output)
; Rows A..H are the eight consecutive source rows starting at rsi; they are
; interleaved pairwise so that pmaddubsw yields tapX*row + tapY*nextrow.
; Clobbers: xmm0-xmm7
;-----------------------------------------------------------------------------
%macro VERT_FILTER_COLS 3
    %2          xmm0, [rsi + %3]            ;A
    %2          xmm1, [rsi + rdx + %3]      ;B
    %2          xmm2, [rsi + rdx * 2 + %3]  ;C
    %2          xmm3, [rax + rdx * 2 + %3]  ;D
    %2          xmm4, [rsi + rdx * 4 + %3]  ;E
    %2          xmm5, [rax + rdx * 4 + %3]  ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    %2          xmm6, [rsi + rbx + %3]      ;G
    %2          xmm7, [rax + rbx + %3]      ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6                  ;t0t1 + t6t7
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(t2t3, t4t5)
    pminsw      xmm4, xmm1                  ;min(t2t3, t4t5)
    paddsw      xmm0, xmm4                  ;add the smaller term first
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to unsigned bytes
%if %1
    %2          xmm1, [rdi + %3]
    pavgb       xmm0, xmm1
%endif
    %2          [rdi + %3], xmm0
%endm

; Advances the source pointers and the output pointer by one row.
%macro VERT_NEXT_ROW 0
    add         rsi, rdx
    add         rax, rdx
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
%endm

; 4-pixel wide vertical filter body.  Parameter: avg flag (0 or 1).
%macro VERTx4 1
    VERT_SETUP

.loop:
    VERT_FILTER_COLS %1, movd, 0

    VERT_NEXT_ROW
    dec         rcx
    jnz         .loop
%endm

; 8-pixel wide vertical filter body.  Parameter: avg flag (0 or 1).
%macro VERTx8 1
    VERT_SETUP

.loop:
    VERT_FILTER_COLS %1, movq, 0

    VERT_NEXT_ROW
    dec         rcx
    jnz         .loop
%endm

; 16-pixel wide vertical filter body, done as two 8-pixel halves per row.
; Parameter: avg flag (0 or 1).
%macro VERTx16 1
    VERT_SETUP

.loop:
    VERT_FILTER_COLS %1, movq, 0
    VERT_FILTER_COLS %1, movq, 8

    VERT_NEXT_ROW
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    VERT_PROLOG
    VERTx4      0
    VERT_EPILOG
    ret

;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    VERT_PROLOG
    VERTx8      0
    VERT_EPILOG
    ret

;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    VERT_PROLOG
    VERTx16     0
    VERT_EPILOG
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; "avg" vertical variants: same arguments as above; the filtered bytes are
; averaged (pavgb) with the bytes already stored at output_ptr.

global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    VERT_PROLOG
    VERTx4      1
    VERT_EPILOG
    ret

global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    VERT_PROLOG
    VERTx8      1
    VERT_EPILOG
    ret

global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    VERT_PROLOG
    VERTx16     1
    VERT_EPILOG
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; Horizontal filters
;-----------------------------------------------------------------------------

; Common prolog of the horizontal functions.  GET_GOT is needed because the
; kernels reference the shuffle tables through GLOBAL().
%macro HORIZ_PROLOG 0
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog
%endm

; Mirror image of HORIZ_PROLOG (everything except the final ret).
%macro HORIZ_EPILOG 0
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
%endm

;-----------------------------------------------------------------------------
; HORIZx4_ROW row, tmp
; Filters four horizontally adjacent pixels.
; In:  row = 16 source bytes starting at src - 3
; Out: low 4 bytes of row = filtered pixels
; After the shuffles, row holds the tap 0/1 byte pairs in its low qword and
; the tap 4/5 pairs in its high qword; tmp holds taps 2/3 (low) and 6/7
; (high).  One pmaddubsw per register therefore covers two tap pairs.
; Clobbers: tmp, xmm4, xmm5, xmm6
;-----------------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2,   %1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %1,   k0k1k4k5
    pmaddubsw   %2,   k2k3k6k7

    movdqa      xmm4, %1
    movdqa      xmm5, %2
    psrldq      %1,   8                     ;bring the high-qword sums down
    psrldq      %2,   8
    movdqa      xmm6, xmm5

    paddsw      xmm4, %2                    ;t0t1 + t6t7
    pmaxsw      xmm5, %1                    ;max(t2t3, t4t5)
    pminsw      %1, xmm6                    ;min(t2t3, t4t5)
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1,   krd
    psraw       %1,   7
    packuswb    %1,   %1
%endm

;-----------------------------------------------------------------------------
; HORIZx4 avg
; 4-pixel wide horizontal filter body; processes two rows per iteration and
; then a single trailing row when output_height is odd.
; Registers: rsi = src, rdi = output, rax = source pitch, rdx = output pitch,
;            rcx = row pairs remaining.
;-----------------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, ROUND_WORD_PAIR

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx                   ;only the low dword is used
    packsswb    xmm4, xmm4
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: skip the pair loop
.loop:
    ;Do two rows once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    movq        xmm2, [rsi + rax - 3]
    movq        xmm3, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3

    HORIZx4_ROW xmm0, xmm1
    HORIZx4_ROW xmm2, xmm3
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
    movd        xmm3, [rdi + rdx]
    pavgb       xmm2, xmm3
%endif
    movd        [rdi], xmm0
    movd        [rdi +rdx], xmm2

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]             ; load src
    movq        xmm1, [rsi + 5]
    punpcklqdq  xmm0, xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
.done:
%endm

;-----------------------------------------------------------------------------
; HORIZx8_ROW row, tmp1, tmp2, tmp3
; Filters eight horizontally adjacent pixels.
; In:  row = 16 source bytes starting at src - 3
; Out: low 8 bytes of row = filtered pixels
; Clobbers: tmp1, tmp2, tmp3
;-----------------------------------------------------------------------------
%macro HORIZx8_ROW 4
    movdqa      %2,   %1
    movdqa      %3,   %1
    movdqa      %4,   %1

    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pshufb      %4,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   %1,   k0k1
    pmaddubsw   %2,   k2k3
    pmaddubsw   %3,   k4k5
    pmaddubsw   %4,   k6k7

    paddsw      %1,   %4                    ;t0t1 + t6t7
    movdqa      %4,   %2
    pmaxsw      %2,   %3                    ;max(t2t3, t4t5)
    pminsw      %3,   %4                    ;min(t2t3, t4t5)
    paddsw      %1,   %3
    paddsw      %1,   %2

    paddsw      %1,   krd
    psraw       %1,   7
    packuswb    %1,   %1
%endm

;-----------------------------------------------------------------------------
; HORIZx8 avg
; 8-pixel wide horizontal filter body; processes two rows per iteration and
; then a single trailing row when output_height is odd.
; Registers: rsi = src, rdi = output, rax = source pitch, rdx = output pitch,
;            rcx = row pairs remaining.
;-----------------------------------------------------------------------------
%macro HORIZx8 1
    SETUP_FILTER_PAIRS

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: skip the pair loop

.loop:
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm3, [rsi + 5]
    movq        xmm4, [rsi + rax - 3]
    movq        xmm7, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm3
    punpcklqdq  xmm4, xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1, [rdi]
    movq        xmm2, [rdi + rdx]
    pavgb       xmm0, xmm1
    pavgb       xmm4, xmm2
%endif
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm4

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]
    movq        xmm3, [rsi + 5]
    punpcklqdq  xmm0, xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0
.done:
%endm

;-----------------------------------------------------------------------------
; HORIZx16 avg
; 16-pixel wide horizontal filter body, one row per iteration.  The row is
; filtered as two 8-pixel halves (xmm0-xmm3 and xmm4-xmm7) which are merged
; before the store.
; Preconditions: output_height >= 1 (bottom-tested loop); every output row
; must be 16-byte aligned because it is accessed with movdqa.
;-----------------------------------------------------------------------------
%macro HORIZx16 1
    SETUP_FILTER_PAIRS

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    movq        xmm0, [rsi - 3]             ;load src data
    movq        xmm4, [rsi + 5]
    movq        xmm7, [rsi + 13]
    punpcklqdq  xmm0, xmm4                  ;bytes src-3 .. src+12
    punpcklqdq  xmm4, xmm7                  ;bytes src+5 .. src+20

    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4

    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
    pshufb      xmm3, [GLOBAL(shuf_t6t7)]
    pshufb      xmm4, [GLOBAL(shuf_t0t1)]
    pshufb      xmm5, [GLOBAL(shuf_t2t3)]
    pshufb      xmm6, [GLOBAL(shuf_t4t5)]
    pshufb      xmm7, [GLOBAL(shuf_t6t7)]

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm1, k2k3
    pmaddubsw   xmm2, k4k5
    pmaddubsw   xmm3, k6k7
    pmaddubsw   xmm4, k0k1
    pmaddubsw   xmm5, k2k3
    pmaddubsw   xmm6, k4k5
    pmaddubsw   xmm7, k6k7

    paddsw      xmm0, xmm3                  ;left half
    movdqa      xmm3, xmm1
    pmaxsw      xmm1, xmm2
    pminsw      xmm2, xmm3
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm1

    paddsw      xmm4, xmm7                  ;right half
    movdqa      xmm7, xmm5
    pmaxsw      xmm5, xmm6
    pminsw      xmm6, xmm7
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm5

    paddsw      xmm0, krd
    paddsw      xmm4, krd
    psraw       xmm0, 7
    psraw       xmm4, 7
    packuswb    xmm0, xmm0
    packuswb    xmm4, xmm4
    punpcklqdq  xmm0, xmm4                  ;merge both halves
%if %1
    movdqa      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    lea         rsi, [rsi + rax]
    movdqa      [rdi], xmm0

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    HORIZ_PROLOG
    FILTER_LOCALS_QUADS
    HORIZx4     0
    FILTER_LOCALS_FREE 3
    HORIZ_EPILOG
    ret

;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    HORIZ_PROLOG
    FILTER_LOCALS_PAIRS
    HORIZx8     0
    FILTER_LOCALS_FREE 5
    HORIZ_EPILOG
    ret

;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    HORIZ_PROLOG
    FILTER_LOCALS_PAIRS
    HORIZx16    0
    FILTER_LOCALS_FREE 5
    HORIZ_EPILOG
    ret

; "avg" horizontal variants: same arguments as above; the filtered bytes are
; averaged (pavgb) with the bytes already stored at output_ptr.

global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    HORIZ_PROLOG
    FILTER_LOCALS_QUADS
    HORIZx4     1
    FILTER_LOCALS_FREE 3
    HORIZ_EPILOG
    ret

global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    HORIZ_PROLOG
    FILTER_LOCALS_PAIRS
    HORIZx8     1
    FILTER_LOCALS_FREE 5
    HORIZ_EPILOG
    ret

global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    HORIZ_PROLOG
    FILTER_LOCALS_PAIRS
    HORIZx16    1
    FILTER_LOCALS_FREE 5
    HORIZ_EPILOG
    ret

; pshufb control vectors.  shuf_tXtY turns 16 consecutive source bytes into
; eight (byte[i + X], byte[i + Y]) pairs, i = 0..7, i.e. the two source
; pixels that taps X and Y multiply for each of eight output positions.
SECTION_RODATA
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14