;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; High bit-depth 8-tap filters, SSE2.
;
; Every entry point in this file takes the same seven arguments, read through
; arg(n) after SHADOW_ARGS_TO_STACK 7:
;   arg(0) src_ptr        pointer to 16-bit source samples
;   arg(1) source pitch   in samples; read as a 32-bit value, sign-extended
;                         and doubled to get a byte stride
;   arg(2) output_ptr     pointer to 16-bit destination samples
;   arg(3) output pitch   in samples; handled like arg(1)
;   arg(4) output_height  number of rows to produce; the loops test at the
;                         bottom (dec/jnz), so it must not be zero
;   arg(5) filter         eight 16-bit taps k0..k7, loaded with movdqa and
;                         therefore required to be 16-byte aligned
;   arg(6) bd             bit depth, read as a 32-bit value
;
; Each output sample is
;   sum of k[i] * s[i] for i = 0..7, plus 64, arithmetically shifted right
;   by 7, saturated to a signed word and then clamped to [0, 2^bd - 1].
; The "_avg" variants additionally combine the result with the sample already
; in the destination using pavgw before storing.
;
; Vertical (v8) routines take s[i] from row i counted from src_ptr.
; Horizontal (h8) routines take s[i] from sample (i - 3) relative to src_ptr
; (the loads start at rsi - 6 bytes).
;
; Stack frame: after ALIGN_STACK 16 and "sub rsp, 16 * N" every [rsp + 16 * i]
; slot is 16-byte aligned, which is why the slots are accessed with movdqa.
; The trailing "pop rsp" restores the caller's stack pointer
; NOTE(review): presumably the value pushed by ALIGN_STACK, which is defined
; in x86_abi_support.asm and not visible here - confirm.
;-----------------------------------------------------------------------------

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS_4
; Set-up for the 4-pixel-wide routines.
;   - broadcasts each of the eight taps at arg(5) and stores them interleaved
;     as word pairs in k0k6, k2k5, k3k4 and k1k7
;   - stores the rounding constant 0x40 in every dword of krd
;   - stores 2^bd - 1 in every word of max and zero in min
; The caller must have %defined k0k6, k2k5, k3k4, k1k7, krd, max and min as
; 16-byte aligned memory operands.
; Clobbers: rcx, rdx, xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x00000040             ;rounding term, half of 1 shl 7

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;bring k4..k7 into the low qword
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm6                  ;k0 k6 k0 k6 ...
    punpcklwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpcklwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...
    punpcklwd   xmm1, xmm7                  ;k1 k7 k1 k7 ...

    movdqa      k0k6, xmm0
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3
    movdqa      k1k7, xmm1

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;0x40 in every dword
    movdqa      krd, xmm6

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001
    movsxd      rcx, DWORD PTR arg(6)       ;bd
    movq        xmm0, rdx
    movq        xmm1, rcx
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;2^bd in every word
    psubw       xmm0, xmm2                  ;2^bd - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)

%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_4 avg
; Filters four pixels and writes 8 bytes to [rdi].
; In:  xmm0..xmm7 hold the samples for taps 0..7 in their low four words.
;      %1 non-zero selects averaging with the existing destination samples.
; Uses the k0k6/k1k7/k2k5/k3k4/krd/max/min slots written by
; HIGH_GET_FILTERS_4.
; Clobbers: xmm0-xmm3.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_4 1
    punpcklwd   xmm0, xmm6                  ;two row in one register
    punpcklwd   xmm1, xmm7
    punpcklwd   xmm2, xmm5
    punpcklwd   xmm3, xmm4

    pmaddwd     xmm0, k0k6                  ;multiply the filter factors
    pmaddwd     xmm1, k1k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm3, k3k4

    paddd       xmm0, xmm1                  ;sum
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3

    paddd       xmm0, krd                   ;rounding
    psrad       xmm0, 7                     ;shift
    packssdw    xmm0, xmm0                  ;pack to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movq        xmm1, [rdi]
    pavgw       xmm0, xmm1
%endif
    movq        [rdi], xmm0
%endm

;-----------------------------------------------------------------------------
; HIGH_GET_FILTERS
; Set-up for the 8- and 16-pixel-wide routines.
;   - loads rsi = arg(0) (src_ptr) and rdi = arg(2) (output_ptr)
;   - broadcasts each of the eight taps at arg(5) and stores them interleaved
;     as word pairs in k0k1, k6k7, k2k5 and k3k4
;   - stores the rounding constant 0x40 in every dword of krd
;   - stores 2^bd - 1 in every word of max and zero in min
; The caller must have %defined k0k1, k6k7, k2k5, k3k4, krd, max and min as
; 16-byte aligned memory operands.
; Clobbers: rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro HIGH_GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x00000040             ;rounding term, half of 1 shl 7

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7
    punpcklqdq  xmm2, xmm2                  ;copy k2 into the high qword too
    punpcklqdq  xmm3, xmm3                  ;copy k3 into the high qword too
    punpcklwd   xmm0, xmm1                  ;k0 k1 k0 k1 ...
    punpckhwd   xmm6, xmm7                  ;k6 k7 k6 k7 ...
    punpckhwd   xmm2, xmm5                  ;k2 k5 k2 k5 ...
    punpckhwd   xmm3, xmm4                  ;k3 k4 k3 k4 ...

    movdqa      k0k1, xmm0                  ;store filter factors on stack
    movdqa      k6k7, xmm6
    movdqa      k2k5, xmm2
    movdqa      k3k4, xmm3

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6                   ;rounding

    ;Compute max and min values of a pixel
    mov         rdx, 0x00010001
    movsxd      rcx, DWORD PTR arg(6)       ;bd
    movq        xmm0, rdx
    movq        xmm1, rcx
    pshufd      xmm0, xmm0, 0b              ;1 in every word
    movdqa      xmm2, xmm0
    psllw       xmm0, xmm1                  ;2^bd in every word
    psubw       xmm0, xmm2                  ;2^bd - 1
    pxor        xmm1, xmm1
    movdqa      max, xmm0                   ;max value (for clamping)
    movdqa      min, xmm1                   ;min value (for clamping)
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_8 offset
; Loads eight consecutive rows of eight samples, starting at byte offset %1
; from rsi, into xmm0..xmm7 (row i goes to xmm<i>).
; In:  rax = source stride in bytes, rdx = 3 * rax.
; Side effect: rsi is advanced by one row (rax bytes).
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movdqu      xmm0, [rsi + %1]            ;0
    movdqu      xmm1, [rsi + rax + %1]      ;1
    movdqu      xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]           ;rows below are relative to row 1
    movdqu      xmm7, [rsi + rdx * 2 + %1]  ;7
    movdqu      xmm2, [rsi + rax + %1]      ;2
    movdqu      xmm3, [rsi + rax * 2 + %1]  ;3
    movdqu      xmm4, [rsi + rdx + %1]      ;4
    movdqu      xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;-----------------------------------------------------------------------------
; HIGH_APPLY_FILTER_8 avg, offset
; Filters eight pixels and writes 16 bytes to [rdi + %2].
; In:  xmm0..xmm7 hold the eight samples for taps 0..7.
;      %1 non-zero selects averaging with the existing destination samples.
;      %2 is the byte offset applied to rdi.
; Uses the k0k1/k6k7/k2k5/k3k4/krd/max/min slots written by HIGH_GET_FILTERS
; and the temp slot as spill space. temp lives in the same 16-byte aligned
; frame as the other slots, so it is accessed with movdqa.
; Clobbers: xmm0-xmm7, temp.
;-----------------------------------------------------------------------------
%macro HIGH_APPLY_FILTER_8 2
    movdqa      temp, xmm4                  ;spill tap 4 samples
    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm1                  ;taps 0/1, pixels 0-3
    punpckhwd   xmm4, xmm1                  ;taps 0/1, pixels 4-7
    movdqa      xmm1, xmm6
    punpcklwd   xmm6, xmm7                  ;taps 6/7, pixels 0-3
    punpckhwd   xmm1, xmm7                  ;taps 6/7, pixels 4-7
    movdqa      xmm7, xmm2
    punpcklwd   xmm2, xmm5                  ;taps 2/5, pixels 0-3
    punpckhwd   xmm7, xmm5                  ;taps 2/5, pixels 4-7

    movdqa      xmm5, temp                  ;reload tap 4 samples
    movdqa      temp, xmm4                  ;spill taps 0/1, pixels 4-7
    movdqa      xmm4, xmm3
    punpcklwd   xmm3, xmm5                  ;taps 3/4, pixels 0-3
    punpckhwd   xmm4, xmm5                  ;taps 3/4, pixels 4-7
    movdqa      xmm5, temp                  ;taps 0/1, pixels 4-7

    pmaddwd     xmm0, k0k1
    pmaddwd     xmm5, k0k1
    pmaddwd     xmm6, k6k7
    pmaddwd     xmm1, k6k7
    pmaddwd     xmm2, k2k5
    pmaddwd     xmm7, k2k5
    pmaddwd     xmm3, k3k4
    pmaddwd     xmm4, k3k4

    paddd       xmm0, xmm6
    paddd       xmm0, xmm2
    paddd       xmm0, xmm3
    paddd       xmm5, xmm1
    paddd       xmm5, xmm7
    paddd       xmm5, xmm4

    paddd       xmm0, krd                   ;rounding
    paddd       xmm5, krd
    psrad       xmm0, 7                     ;shift
    psrad       xmm5, 7
    packssdw    xmm0, xmm5                  ;pack back to word

    ;clamp the values
    pminsw      xmm0, max
    pmaxsw      xmm0, min

%if %1
    movdqu      xmm1, [rdi + %2]
    pavgw       xmm0, xmm1
%endif
    movdqu      [rdi + %2], xmm0
%endm

SECTION .text

; Vertical filter, 4 pixels wide. Reads 4 samples from each of 8 source rows
; per output row and stores 4 samples.
; NOTE(review): pointer types are shown as 16-bit because the code accesses
; samples as words; confirm the exact C types against the declaring header.
;void vpx_highbd_filter_block1d4_v8_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pitch,
;    unsigned short *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d4_v8_sse2)
sym(vpx_highbd_filter_block1d4_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one row for the next pass
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical filter, 8 pixels wide. Reads 8 samples from each of 8 source rows
; per output row and stores 8 samples.
;void vpx_highbd_filter_block1d8_v8_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pitch,
;    unsigned short *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d8_v8_sse2)
sym(vpx_highbd_filter_block1d8_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0                           ;also advances rsi by one row
    HIGH_APPLY_FILTER_8 0, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical filter, 16 pixels wide, processed as two 8-pixel halves per row.
;void vpx_highbd_filter_block1d16_v8_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pitch,
;    unsigned short *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d16_v8_sse2)
sym(vpx_highbd_filter_block1d16_v8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0
    HIGH_APPLY_FILTER_8 0, 0
    sub         rsi, rax                    ;undo the row advance of LOAD_VERT_8

    LOAD_VERT_8 16                          ;right half; advances rsi one row
    HIGH_APPLY_FILTER_8 0, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical filter, 4 pixels wide, result averaged (pavgw) with the samples
; already present in the destination.
;void vpx_highbd_filter_block1d4_v8_avg_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pitch,
;    unsigned short *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d4_v8_avg_sse2)
sym(vpx_highbd_filter_block1d4_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movq        xmm0, [rsi]                 ;load src: row 0
    movq        xmm1, [rsi + rax]           ;1
    movq        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;advance one row for the next pass
    movq        xmm7, [rsi + rdx * 2]       ;7
    movq        xmm2, [rsi + rax]           ;2
    movq        xmm3, [rsi + rax * 2]       ;3
    movq        xmm4, [rsi + rdx]           ;4
    movq        xmm5, [rsi + rax * 4]       ;5

    HIGH_APPLY_FILTER_4 1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical filter, 8 pixels wide, result averaged (pavgw) with the samples
; already present in the destination.
;void vpx_highbd_filter_block1d8_v8_avg_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pitch,
;    unsigned short *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d8_v8_avg_sse2)
sym(vpx_highbd_filter_block1d8_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0                           ;also advances rsi by one row
    HIGH_APPLY_FILTER_8 1, 0

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Vertical filter, 16 pixels wide (two 8-pixel halves per row), result
; averaged (pavgw) with the samples already present in the destination.
;void vpx_highbd_filter_block1d16_v8_avg_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pitch,
;    unsigned short *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d16_v8_avg_sse2)
sym(vpx_highbd_filter_block1d16_v8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rbx, [rbx + rbx]
    lea         rdx, [rax + rax * 2]        ;3 rows in bytes
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
.loop:
    LOAD_VERT_8 0
    HIGH_APPLY_FILTER_8 1, 0
    sub         rsi, rax                    ;undo the row advance of LOAD_VERT_8

    LOAD_VERT_8 16                          ;right half; advances rsi one row
    HIGH_APPLY_FILTER_8 1, 16
    add         rdi, rbx

    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal filter, 4 pixels wide. Each row reads 16 bytes at src - 6 and
; 16 bytes at src + 2, and stores 4 samples.
;void vpx_highbd_filter_block1d4_h8_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned short *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d4_h8_sse2)
sym(vpx_highbd_filter_block1d4_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm4, [rsi + 2]
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;shift each copy so xmm<i> starts at the sample for tap i
    psrldq      xmm1, 2
    psrldq      xmm6, 4
    psrldq      xmm7, 6
    psrldq      xmm2, 4
    psrldq      xmm3, 6
    psrldq      xmm5, 2

    HIGH_APPLY_FILTER_4 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal filter, 8 pixels wide. Each row issues eight unaligned 16-byte
; loads at src - 6 through src + 8 and stores 8 samples.
;void vpx_highbd_filter_block1d8_h8_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned short *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d8_h8_sse2)
sym(vpx_highbd_filter_block1d8_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal filter, 16 pixels wide, processed as two 8-pixel halves per row
; (the second half loads from src + 10 through src + 24).
;void vpx_highbd_filter_block1d16_h8_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned short *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d16_h8_sse2)
sym(vpx_highbd_filter_block1d16_h8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 0, 0

    movdqu      xmm0, [rsi + 10]            ;load src
    movdqu      xmm1, [rsi + 12]
    movdqu      xmm2, [rsi + 14]
    movdqu      xmm3, [rsi + 16]
    movdqu      xmm4, [rsi + 18]
    movdqu      xmm5, [rsi + 20]
    movdqu      xmm6, [rsi + 22]
    movdqu      xmm7, [rsi + 24]

    HIGH_APPLY_FILTER_8 0, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal filter, 4 pixels wide, result averaged (pavgw) with the samples
; already present in the destination.
;void vpx_highbd_filter_block1d4_h8_avg_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned short *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d4_h8_avg_sse2)
sym(vpx_highbd_filter_block1d4_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 7
    %define k0k6 [rsp + 16 * 0]
    %define k2k5 [rsp + 16 * 1]
    %define k3k4 [rsp + 16 * 2]
    %define k1k7 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define max [rsp + 16 * 5]
    %define min [rsp + 16 * 6]

    HIGH_GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm4, [rsi + 2]
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4

    ;shift each copy so xmm<i> starts at the sample for tap i
    psrldq      xmm1, 2
    psrldq      xmm6, 4
    psrldq      xmm7, 6
    psrldq      xmm2, 4
    psrldq      xmm3, 6
    psrldq      xmm5, 2

    HIGH_APPLY_FILTER_4 1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 7
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal filter, 8 pixels wide, result averaged (pavgw) with the samples
; already present in the destination.
;void vpx_highbd_filter_block1d8_h8_avg_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned short *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d8_h8_avg_sse2)
sym(vpx_highbd_filter_block1d8_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 1, 0

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Horizontal filter, 16 pixels wide (two 8-pixel halves per row), result
; averaged (pavgw) with the samples already present in the destination.
;void vpx_highbd_filter_block1d16_h8_avg_sse2
;(
;    unsigned short *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned short *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short          *filter,
;    int             bd
;)
globalsym(vpx_highbd_filter_block1d16_h8_avg_sse2)
sym(vpx_highbd_filter_block1d16_h8_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 8
    %define k0k1 [rsp + 16 * 0]
    %define k6k7 [rsp + 16 * 1]
    %define k2k5 [rsp + 16 * 2]
    %define k3k4 [rsp + 16 * 3]
    %define krd [rsp + 16 * 4]
    %define temp [rsp + 16 * 5]
    %define max [rsp + 16 * 6]
    %define min [rsp + 16 * 7]

    HIGH_GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    lea         rax, [rax + rax]            ;bytes per line
    lea         rdx, [rdx + rdx]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 6]             ;load src
    movdqu      xmm1, [rsi - 4]
    movdqu      xmm2, [rsi - 2]
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi + 2]
    movdqu      xmm5, [rsi + 4]
    movdqu      xmm6, [rsi + 6]
    movdqu      xmm7, [rsi + 8]

    HIGH_APPLY_FILTER_8 1, 0

    movdqu      xmm0, [rsi + 10]            ;load src
    movdqu      xmm1, [rsi + 12]
    movdqu      xmm2, [rsi + 14]
    movdqu      xmm3, [rsi + 16]
    movdqu      xmm4, [rsi + 18]
    movdqu      xmm5, [rsi + 20]
    movdqu      xmm6, [rsi + 22]
    movdqu      xmm7, [rsi + 24]

    HIGH_APPLY_FILTER_8 1, 16

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    add         rsp, 16 * 8
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret