;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; SSE2 8-tap filters for 4-, 8- and 16-pixel-wide blocks (NASM/Intel syntax).
;
; Every exported routine reads the same six arguments through arg(n):
;   arg(0) src_ptr        arg(1) source pitch     arg(2) output_ptr
;   arg(3) output pitch   arg(4) output_height    arg(5) filter (8 x int16)
;
; Requirements that follow directly from the code below:
;   * filter is loaded with movdqa, so it must be 16-byte aligned.
;   * the row loops are dec/jnz loops that run at least once, so
;     output_height must be non-zero.
;   * vertical (v8) routines read source rows 0..7 starting at src_ptr.
;   * horizontal (h8) routines load 16 bytes starting at src_ptr - 3
;     (and at src_ptr + 5 for the second half of a 16-wide row).
;
; Per output pixel: the eight 16-bit products tap[i] * px[i] are summed with
; saturating adds (paddsw), SP8T_ROUND_PAIR's 64 is added, the sum is shifted
; right arithmetically by SP8T_SHIFT and packed to an unsigned byte with
; saturation.  The *_avg_* routines then pavgb that byte with the byte already
; stored at the destination.
;
; sym(), arg(), PRIVATE, SHADOW_ARGS_TO_STACK, SAVE_XMM, ALIGN_STACK,
; RESTORE_XMM and UNSHADOW_ARGS come from x86_abi_support.asm, which is not
; part of this file; presumably they hide the differences between the
; supported ABIs -- confirm there before changing the prologue/epilogue.
;-----------------------------------------------------------------------------

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.

%define SP8T_ROUND_PAIR 0x00400040          ;two 16-bit words, each 64 (1 << 6)
%define SP8T_SHIFT      7                   ;final arithmetic right shift
%define SP8T_SLOTS_4    6                   ;16-byte stack slots, 4-wide code
%define SP8T_SLOTS_8    10                  ;16-byte stack slots, 8/16-wide code

; Stack slots used by the 4-wide routines (valid between FILTER_FUNC_BEGIN
; and FILTER_FUNC_END, while rsp is 16-byte aligned and unchanged).
%define k0k1    [rsp + 16 * 0]              ;k0 x4 | k1 x4
%define k2k3    [rsp + 16 * 1]              ;k2 x4 | k3 x4
%define k5k4    [rsp + 16 * 2]              ;k5 x4 | k4 x4
%define k6k7    [rsp + 16 * 3]              ;k6 x4 | k7 x4
%define krd4    [rsp + 16 * 4]              ;rounding words
%define zero4   [rsp + 16 * 5]              ;all-zero vector

; Stack slots used by the 8- and 16-wide routines.
%define k0      [rsp + 16 * 0]              ;each slot: one tap x8
%define k1      [rsp + 16 * 1]
%define k2      [rsp + 16 * 2]
%define k3      [rsp + 16 * 3]
%define k4      [rsp + 16 * 4]
%define k5      [rsp + 16 * 5]
%define k6      [rsp + 16 * 6]
%define k7      [rsp + 16 * 7]
%define krd8    [rsp + 16 * 8]              ;rounding words
%define zero8   [rsp + 16 * 9]              ;all-zero vector

;-----------------------------------------------------------------------------
; FILTER_FUNC_BEGIN slots, uses_rbx
;   Common prologue.  Saves rbp/rsi/rdi (and rbx when %2 is non-zero), aligns
;   the stack to 16 bytes and reserves %1 16-byte slots for the locals above.
;   Clobbers rax (scratch register handed to ALIGN_STACK).
;-----------------------------------------------------------------------------
%macro FILTER_FUNC_BEGIN 2
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
%if %2
    push        rbx
%endif
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * %1
%endm

;-----------------------------------------------------------------------------
; FILTER_FUNC_END slots, uses_rbx
;   Common epilogue; must be given the same arguments as FILTER_FUNC_BEGIN.
;   Releases the locals, restores the pre-alignment rsp with `pop rsp`
;   (NOTE(review): relies on ALIGN_STACK having left that value on the stack
;   -- confirm in x86_abi_support.asm), restores saved registers and returns.
;-----------------------------------------------------------------------------
%macro FILTER_FUNC_END 2
    add         rsp, 16 * %1
    pop         rsp
%if %2
    pop         rbx
%endif
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endm

;-----------------------------------------------------------------------------
; GET_FILTERS_4
;   Loads src/dst pointers and prepares the 4-wide locals: each slot holds two
;   taps, four copies of each, in the order given by the slot name.
;   Out:   rsi = src_ptr, rdi = output_ptr
;   Clobb: rcx, rdx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, SP8T_ROUND_PAIR

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;bring k4..k7 into the low half
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklqdq  xmm0, xmm1                  ;k0 x4 | k1 x4
    punpcklqdq  xmm2, xmm3                  ;k2 x4 | k3 x4
    punpcklqdq  xmm5, xmm4                  ;k5 x4 | k4 x4
    punpcklqdq  xmm6, xmm7                  ;k6 x4 | k7 x4

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm2
    movdqa      k5k4, xmm5
    movdqa      k6k7, xmm6

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;broadcast the rounding pair
    movdqa      krd4, xmm6

    pxor        xmm7, xmm7
    movdqa      zero4, xmm7
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_4 avg
;   In:    xmm0..xmm7 = the eight 4-byte inputs for taps 0..7 (low dword),
;          rdi = destination
;   Out:   4 filtered bytes stored at [rdi]; when %1 is non-zero they are
;          first averaged (pavgb) with the 4 bytes already at [rdi].
;   Clobb: xmm0-xmm2, xmm5, xmm6
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1                  ;two row in one register
    punpckldq   xmm6, xmm7
    punpckldq   xmm2, xmm3
    punpckldq   xmm5, xmm4

    punpcklbw   xmm0, zero4                 ;unpack to word
    punpcklbw   xmm6, zero4
    punpcklbw   xmm2, zero4
    punpcklbw   xmm5, zero4

    pmullw      xmm0, k0k1                  ;multiply the filter factors
    pmullw      xmm6, k6k7
    pmullw      xmm2, k2k3
    pmullw      xmm5, k5k4

    paddsw      xmm0, xmm6                  ;sum
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1                  ;low half: taps 0, 1, 6, 7
    paddsw      xmm0, xmm2                  ;+ tap 2
    psrldq      xmm2, 8
    paddsw      xmm0, xmm5                  ;+ tap 5
    psrldq      xmm5, 8
    paddsw      xmm0, xmm2                  ;+ tap 3 (added late, see note)
    paddsw      xmm0, xmm5                  ;+ tap 4 (added late, see note)

    paddsw      xmm0, krd4                  ;rounding
    psraw       xmm0, SP8T_SHIFT            ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endm

;-----------------------------------------------------------------------------
; GET_FILTERS
;   Loads src/dst pointers and prepares the 8-wide locals: k0..k7 each hold
;   one tap replicated into all eight words.
;   Out:   rsi = src_ptr, rdi = output_ptr
;   Clobb: rcx, rdx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, SP8T_ROUND_PAIR

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm0                  ;spread low-half taps to 8 words
    punpcklwd   xmm1, xmm1
    punpcklwd   xmm2, xmm2
    punpcklwd   xmm3, xmm3
    punpckhwd   xmm4, xmm4                  ;spread high-half taps to 8 words
    punpckhwd   xmm5, xmm5
    punpckhwd   xmm6, xmm6
    punpckhwd   xmm7, xmm7

    movdqa      k0,   xmm0                  ;store filter factors on stack
    movdqa      k1,   xmm1
    movdqa      k2,   xmm2
    movdqa      k3,   xmm3
    movdqa      k4,   xmm4
    movdqa      k5,   xmm5
    movdqa      k6,   xmm6
    movdqa      k7,   xmm7

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd8, xmm6                  ;rounding

    pxor        xmm7, xmm7
    movdqa      zero8, xmm7
%endm

;-----------------------------------------------------------------------------
; LOAD_ARGS_VERT / LOAD_ARGS_HORIZ
;   Load pitches and row count for the row loops.
;   Vertical:   rax = source pitch, rbx = output pitch, rdx = 3 * source
;               pitch, rcx = output_height.
;   Horizontal: rax = source pitch, rdx = output pitch, rcx = output_height.
;-----------------------------------------------------------------------------
%macro LOAD_ARGS_VERT 0
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

%macro LOAD_ARGS_HORIZ 0
    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_4
;   In:    rsi = row 0, rax = pitch, rdx = 3 * pitch
;   Out:   xmmN = 4 bytes of row N (N = 0..7); rsi advanced by one row
;-----------------------------------------------------------------------------
%macro LOAD_VERT_4 0
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;rsi now points at row 1
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_8 column_offset
;   In:    rsi = row 0, rax = pitch, rdx = 3 * pitch
;   Out:   xmmN = 8 bytes of row N at byte offset %1 (N = 0..7);
;          rsi advanced by one row
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movq        xmm0, [rsi + %1]            ;0
    movq        xmm1, [rsi + rax + %1]      ;1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]           ;rsi now points at row 1
    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
    movq        xmm2, [rsi + rax + %1]      ;2
    movq        xmm3, [rsi + rax * 2 + %1]  ;3
    movq        xmm4, [rsi + rdx + %1]      ;4
    movq        xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;-----------------------------------------------------------------------------
; LOAD_HORIZ column_offset
;   In:    rsi = first output pixel of the current row
;   Out:   xmmN = the 16 bytes at [rsi + %1 - 3] shifted right by N bytes
;          (N = 0..7), i.e. the input for tap N in the low bytes.
;          rsi is not modified.
;-----------------------------------------------------------------------------
%macro LOAD_HORIZ 1
    movdqu      xmm0, [rsi + %1 - 3]        ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_8 avg, column_offset
;   In:    xmm0..xmm7 = the eight 8-byte inputs for taps 0..7 (low qword),
;          rdi = destination row
;   Out:   8 filtered bytes stored at [rdi + %2]; when %1 is non-zero they are
;          first averaged (pavgb) with the 8 bytes already there.
;   Clobb: xmm0-xmm7
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 2
    punpcklbw   xmm0, zero8                 ;unpack to word
    punpcklbw   xmm1, zero8
    punpcklbw   xmm6, zero8
    punpcklbw   xmm7, zero8
    punpcklbw   xmm2, zero8
    punpcklbw   xmm5, zero8
    punpcklbw   xmm3, zero8
    punpcklbw   xmm4, zero8

    pmullw      xmm0, k0                    ;multiply the filter factors
    pmullw      xmm1, k1
    pmullw      xmm6, k6
    pmullw      xmm7, k7
    pmullw      xmm2, k2
    pmullw      xmm5, k5
    pmullw      xmm3, k3
    pmullw      xmm4, k4

    paddsw      xmm0, xmm1                  ;sum; taps 3 and 4 go last
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd8                  ;rounding
    psraw       xmm0, SP8T_SHIFT            ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi + %2]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm

SECTION .text

;void vpx_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 4 pixels per row.
global sym(vpx_filter_block1d4_v8_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_4, 1
    GET_FILTERS_4
    LOAD_ARGS_VERT

.loop:
    LOAD_VERT_4                             ;also steps rsi to the next row
    APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_4, 1

;void vpx_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 8 pixels per row.
global sym(vpx_filter_block1d8_v8_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 1
    GET_FILTERS
    LOAD_ARGS_VERT

.loop:
    LOAD_VERT_8 0                           ;also steps rsi to the next row
    APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 1

;void vpx_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
; Vertical 8-tap filter, 16 pixels per row, done as two 8-pixel halves.
global sym(vpx_filter_block1d16_v8_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 1
    GET_FILTERS
    LOAD_ARGS_VERT

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo the row step for the 2nd half

    LOAD_VERT_8 8                           ;steps rsi to the next row again
    APPLY_FILTER_8 0, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 1

; vpx_filter_block1d4_v8_avg_sse2: same arguments and filtering as
; vpx_filter_block1d4_v8_sse2, but each result is averaged (pavgb) with the
; bytes already present at output_ptr before being stored.
global sym(vpx_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_v8_avg_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_4, 1
    GET_FILTERS_4
    LOAD_ARGS_VERT

.loop:
    LOAD_VERT_4                             ;also steps rsi to the next row
    APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_4, 1

; vpx_filter_block1d8_v8_avg_sse2: same arguments and filtering as
; vpx_filter_block1d8_v8_sse2, but each result is averaged (pavgb) with the
; bytes already present at output_ptr before being stored.
global sym(vpx_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_v8_avg_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 1
    GET_FILTERS
    LOAD_ARGS_VERT

.loop:
    LOAD_VERT_8 0                           ;also steps rsi to the next row
    APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 1

; vpx_filter_block1d16_v8_avg_sse2: same arguments and filtering as
; vpx_filter_block1d16_v8_sse2, but each result is averaged (pavgb) with the
; bytes already present at output_ptr before being stored.
global sym(vpx_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_v8_avg_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 1
    GET_FILTERS
    LOAD_ARGS_VERT

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;undo the row step for the 2nd half

    LOAD_VERT_8 8                           ;steps rsi to the next row again
    APPLY_FILTER_8 1, 8
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 1

;void vpx_filter_block1d4_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 4 pixels per row.
global sym(vpx_filter_block1d4_h8_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_4, 0
    GET_FILTERS_4
    LOAD_ARGS_HORIZ

.loop:
    LOAD_HORIZ 0
    APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_4, 0

;void vpx_filter_block1d8_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 8 pixels per row.
global sym(vpx_filter_block1d8_h8_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 0
    GET_FILTERS
    LOAD_ARGS_HORIZ

.loop:
    LOAD_HORIZ 0
    APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 0

;void vpx_filter_block1d16_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
; Horizontal 8-tap filter, 16 pixels per row, done as two 8-pixel halves.
global sym(vpx_filter_block1d16_h8_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 0
    GET_FILTERS
    LOAD_ARGS_HORIZ

.loop:
    LOAD_HORIZ 0                            ;pixels 0..7
    APPLY_FILTER_8 0, 0

    LOAD_HORIZ 8                            ;pixels 8..15
    APPLY_FILTER_8 0, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 0

; vpx_filter_block1d4_h8_avg_sse2: same arguments and filtering as
; vpx_filter_block1d4_h8_sse2, but each result is averaged (pavgb) with the
; bytes already present at output_ptr before being stored.
global sym(vpx_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_h8_avg_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_4, 0
    GET_FILTERS_4
    LOAD_ARGS_HORIZ

.loop:
    LOAD_HORIZ 0
    APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_4, 0

; vpx_filter_block1d8_h8_avg_sse2: same arguments and filtering as
; vpx_filter_block1d8_h8_sse2, but each result is averaged (pavgb) with the
; bytes already present at output_ptr before being stored.
global sym(vpx_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_h8_avg_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 0
    GET_FILTERS
    LOAD_ARGS_HORIZ

.loop:
    LOAD_HORIZ 0
    APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 0

; vpx_filter_block1d16_h8_avg_sse2: same arguments and filtering as
; vpx_filter_block1d16_h8_sse2, but each result is averaged (pavgb) with the
; bytes already present at output_ptr before being stored.
global sym(vpx_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_h8_avg_sse2):
    FILTER_FUNC_BEGIN SP8T_SLOTS_8, 0
    GET_FILTERS
    LOAD_ARGS_HORIZ

.loop:
    LOAD_HORIZ 0                            ;pixels 0..7
    APPLY_FILTER_8 1, 0

    LOAD_HORIZ 8                            ;pixels 8..15
    APPLY_FILTER_8 1, 8

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_FUNC_END SP8T_SLOTS_8, 0