;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/Yasm (Intel operand order).
; All ABI handling (argument access through arg(n), GOT setup, XMM
; save/restore, stack alignment) is delegated to the macros pulled in from
; x86_abi_support.asm; their exact expansion is not visible in this file.

%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

; Byte offsets of the k1_k3 and k2_k4 tables relative to k0_k5.  Each table
; holds 8 entries of 16 bytes, and the three tables are laid out back to back
; in the read-only data section at the end of this file.
%define K1_K3_OFFSET 128
%define K2_K4_OFFSET 256

SECTION .text

;/************************************************************************************
; Notes on the six-tap filters in this file:
;
; * vp8_filter_index (arg 5) selects one 16-byte entry in each of the tables
;   k0_k5, k1_k3 and k2_k4 (index * 16).  The tables have 8 entries, so the
;   index must be in 0..7.
; * Entry 0 of the tables is marked as a placeholder: its centre tap is 128,
;   which pmaddubsw interprets as the signed byte -128, so index 0 does not
;   produce a usable result here.
; * When the k0_k5 entry is zero (outer taps are 0) the routines that test for
;   it branch to a cheaper 4-tap path.
; * Every loop handles ONE row per iteration and is a do/while loop
;   (dec rcx / jnz), so output_height must be non-zero.
; * Each result is (sum + 64) >> VP8_FILTER_SHIFT, saturated to 0..255.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/

;-----------------------------------------------------------------------------
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal six-tap filter, 8 output pixels per row.
; Reads 8 bytes at src_ptr - 2 and 8 bytes at src_ptr + 3 for every row, and
; writes 8 bytes per row with an unaligned-safe movq store.
; Registers: rsi = src, rdi = dst, rax = src stride, rdx = dst stride,
;            rcx = rows remaining, xmm7 = rounding constant.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against k0_k5 below
    shl         rdx, 4                  ;16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          .vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]                 ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-decrement: the loop advances rdi before storing
;xmm3 free
.filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm2                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4                    ; k0*p[-2] + k5*p[3] per output

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]   ; pairs (p[0], p[2])

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]   ; pairs (p[-1], p[1])
    pmaddubsw   xmm1,   xmm5

    lea         rdi,    [rdi + rdx]
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx                             ; flags consumed by the jnz below

    paddsw      xmm0,   xmm1
    paddsw      xmm2,   xmm7

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0
    jnz         .filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant of the routine above.  Entered with rax = &k0_k5[index],
; rdi = output_ptr and xmm7 = rounding constant already set up.
.vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ;pre-decrement, see the 6-tap loop

.filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm1                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2,   xmm0
    pshufb      xmm0,   xmm3                    ; pairs (p[0], p[2])

    pshufb      xmm2,   xmm4                    ; pairs (p[-1], p[1])
    pmaddubsw   xmm0,   xmm5

    lea         rdi,    [rdi + rdx]
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
    dec         rcx                             ; flags consumed by the jnz below

    paddsw      xmm0,   xmm7

    paddsw      xmm0,   xmm2

    psraw       xmm0,   VP8_FILTER_SHIFT

    packuswb    xmm0,   xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         .filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal six-tap filter, 16 output pixels per row (two 8-pixel halves).
; There is no 4-tap shortcut here: all three coefficient pairs are always
; applied.
; The row is stored with movdqa, so output_ptr and output_pitch must keep
; every row 16-byte aligned.
; Registers: rsi = src, rdi = dst, rax = src stride, rdx = dst stride,
;            rcx = rows remaining.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    mov         rdi, arg(2)             ;output_ptr

    mov         rsi, arg(0)             ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]                 ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d16_h6_rowloop_ssse3:
    ; ---- left 8 pixels (interleaved with the loads for the right half) ----
    movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5

    movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0,   xmm3                    ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1,   xmm0
    pmaddubsw   xmm0,   xmm4

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]
    movq        xmm3,   MMWORD PTR [rsi +  6]   ; right half: same layout, 8 pixels further

    pmaddubsw   xmm1,   xmm5
    movq        xmm7,   MMWORD PTR [rsi + 11]

    pmaddubsw   xmm2,   xmm6
    punpcklbw   xmm3,   xmm7

    paddsw      xmm0,   xmm1
    movdqa      xmm1,   xmm3

    pmaddubsw   xmm3,   xmm4
    paddsw      xmm0,   xmm2

    movdqa      xmm2,   xmm1
    paddsw      xmm0,   [GLOBAL(rd)]

    pshufb      xmm1,   [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2,   [GLOBAL(shuf3bfrom1)]

    psraw       xmm0,   VP8_FILTER_SHIFT
    pmaddubsw   xmm1,   xmm5

    pmaddubsw   xmm2,   xmm6
    packuswb    xmm0,   xmm0

    lea         rsi,    [rsi + rax]
    paddsw      xmm3,   xmm1

    paddsw      xmm3,   xmm2

    paddsw      xmm3,   [GLOBAL(rd)]

    psraw       xmm3,   VP8_FILTER_SHIFT

    packuswb    xmm3,   xmm3

    punpcklqdq  xmm0,   xmm3                    ; low 8 bytes = left half, high 8 = right half

    movdqa      XMMWORD Ptr [rdi], xmm0         ; aligned 16-byte store

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal six-tap filter, 4 output pixels per row.
; Each row is fetched with one unaligned 16-byte load at src_ptr - 2, so 16
; bytes starting there must be readable even though only 4 output pixels are
; stored (movd).
; Registers: rsi = src, rdi = dst, rax = src stride, rdx = dst stride,
;            rcx = rows remaining, xmm7 = rounding constant.
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against k0_k5 below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    movdqa      xmm7, [GLOBAL(rd)]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          .vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]                 ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

;xmm3 free
.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0,   XMMWORD PTR [rsi - 2]   ; byte i holds pixel i - 2

    movdqa      xmm1,   xmm0
    pshufb      xmm0,   [GLOBAL(shuf1b)]        ; pairs (p[-2], p[3])

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   [GLOBAL(shuf2b)]        ; pairs (p[0], p[2])
    pmaddubsw   xmm0,   xmm4
    pshufb      xmm2,   [GLOBAL(shuf3b)]        ; pairs (p[-1], p[1])
    pmaddubsw   xmm1,   xmm5

;--
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
;--
    paddsw      xmm0,   xmm1
    paddsw      xmm0,   xmm7
    paddsw      xmm0,   xmm2
    psraw       xmm0,   VP8_FILTER_SHIFT
    packuswb    xmm0,   xmm0

    movd        DWORD PTR [rdi],    xmm0

    add         rdi,    rdx
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant.  Entered with rax = &k0_k5[index] and xmm7 = rounding
; constant; the shuffle masks are kept in xmm0 / xmm3 for the whole loop.
.vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1,   XMMWORD PTR [rsi - 2]   ; byte i holds pixel i - 2

    movdqa      xmm2,   xmm1
    pshufb      xmm1,   xmm0 ;;[GLOBAL(shuf2b)]
    pshufb      xmm2,   xmm3 ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1,   xmm5

;--
    pmaddubsw   xmm2,   xmm6

    lea         rsi,    [rsi + rax]
;--
    paddsw      xmm1,   xmm7
    paddsw      xmm1,   xmm2
    psraw       xmm1,   VP8_FILTER_SHIFT
    packuswb    xmm1,   xmm1

    movd        DWORD PTR [rdi],    xmm1

    add         rdi,    rdx
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret



;-----------------------------------------------------------------------------
;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical six-tap filter, 16 pixels wide.
; For each output row the six source rows A..F are read at src_ptr + 0..5 *
; src_pitch, i.e. src_ptr addresses the first tap row (A).
; The 6-tap path stores each row as two 8-byte halves (movq); the 4-tap path
; stores the row with one movdqa, which requires a 16-byte aligned destination
; row.  Callers therefore need an aligned output_ptr / out_pitch to be safe
; for every filter index.
; Registers: rsi = row A, rax = row B (rsi + pitch), rdx = src pitch,
;            rdi = dst, rcx = rows remaining, r8 = out_pitch (64-bit only).
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against k0_k5 below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          .vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]                 ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = src + pitch, used to reach rows D and F


.vp8_filter_block1d16_v6_ssse3_loop:
    ; ---- left 8 columns ----
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2          ;store the results

    ; ---- right 8 columns ----
    movq        xmm1, MMWORD PTR [rsi + 8]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]        ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant (rows B..E only).  Entered with rax = &k0_k5[index].
.vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx                ;rax = src + pitch, used to reach row D

.vp8_filter_block1d16_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]            ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]        ;E

    paddsw      xmm2, [GLOBAL(rd)]
    paddsw      xmm2, xmm3
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, VP8_FILTER_SHIFT
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5                  ; low 8 bytes = left half, high 8 = right half

    movdqa       XMMWORD PTR [rdi], xmm2    ; aligned 16-byte store

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;out_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical six-tap filter, 8 pixels wide.  src_ptr addresses the first tap
; row (A); rows A..F are read at src_ptr + 0..5 * src_pitch.  Output rows are
; written with movq, so no destination alignment is required.
; Registers: rsi = row A, rax = row B (rsi + pitch), rdx = src pitch,
;            rdi = dst, rcx = rows remaining, r8 = out_pitch (64-bit only).
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against k0_k5 below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          .vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]                 ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src + pitch, used to reach rows D and F

.vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    movdqa      xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant (rows B..E only).  Entered with rax = &k0_k5[index],
; rdx = src pitch, rdi = dst, rcx = rows and (64-bit) r8 = out_pitch.
.vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+K2_K4_OFFSET]    ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+K1_K3_OFFSET]    ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src + pitch, used to reach row D

.vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical six-tap filter, 4 pixels wide.  src_ptr addresses the first tap
; row (A); rows A..F are read at src_ptr + 0..5 * src_pitch.
; This routine works in the MMX registers (mm0-mm7), so it neither saves nor
; restores XMM registers.  It does not execute emms before returning.
; NOTE(review): the caller is presumably responsible for clearing the MMX
; state before any x87 code runs - confirm against the call sites.
; Registers: rsi = row A, rax = row B (rsi + pitch), rdx = src pitch,
;            rdi = dst, rcx = rows remaining, r8 = out_pitch (64-bit only).
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ;esi = 0, compared against k0_k5 below
    shl         rdx, 4                  ;16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]    ;outer taps zero -> 4-tap path
    je          .vp8_filter_block1d4_v4_ssse3

    ; only the low 8 bytes of each 16-byte table entry are needed for MMX
    movq        mm5, MMWORD PTR [rax]                   ;k0_k5
    movq        mm6, MMWORD PTR [rax+K2_K4_OFFSET]      ;k2_k4
    movq        mm7, MMWORD PTR [rax+K1_K3_OFFSET]      ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src + pitch, used to reach rows D and F

.vp8_filter_block1d4_v6_ssse3_loop:
    movd        mm1, DWORD PTR [rsi]                  ;A
    movd        mm2, DWORD PTR [rsi + rdx]            ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   mm2, mm4                  ;B D
    punpcklbw   mm3, mm0                  ;C E

    movd        mm0, DWORD PTR [rax + rdx * 4]        ;F

    movq        mm4, [GLOBAL(rd)]

    pmaddubsw   mm3, mm6
    punpcklbw   mm1, mm0                  ;A F
    pmaddubsw   mm2, mm7
    pmaddubsw   mm1, mm5
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v6_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap variant (rows B..E only).  Entered with rax = &k0_k5[index],
; rdx = src pitch, rdi = dst, rcx = rows and (64-bit) r8 = out_pitch.
.vp8_filter_block1d4_v4_ssse3:
    movq        mm6, MMWORD PTR [rax+K2_K4_OFFSET]      ;k2_k4
    movq        mm7, MMWORD PTR [rax+K1_K3_OFFSET]      ;k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx                ;rax = src + pitch, used to reach row D

.vp8_filter_block1d4_v4_ssse3_loop:
    movd        mm2, DWORD PTR [rsi + rdx]            ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   mm2, mm4                  ;B D
    punpcklbw   mm3, mm0                  ;C E

    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi,  rdx
    add         rax,  rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[out_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v4_ssse3_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 16x16 bilinear prediction.  xoffset / yoffset each select one 16-byte entry
; of vp8_bilinear_filters_ssse3 (8 entries, so the offsets must be in 0..7).
;   xoffset != 0 && yoffset != 0 : horizontal pass followed by vertical pass
;   xoffset == 0                 : vertical pass only   (.b16x16_sp_only)
;   yoffset == 0                 : horizontal pass only (.b16x16_fp_only)
; Limitation: with xoffset == 0 && yoffset == 0 the vertical-only path loads
; the filter (128, 0).  pmaddubsw treats the filter bytes as signed, so 128
; acts as -128, the sums go negative and packuswb saturates them to 0.  That
; combination therefore does not copy the block and must not be passed in.
; Every destination row is written with movdqa, so dst_ptr and dst_pitch must
; keep each row 16-byte aligned.
; Registers: rsi = src, rdi = dst, rcx = end of destination (dst + 16 rows).
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax,        dword ptr arg(2)    ; xoffset

    test        rax,        rax                 ; skip first_pass filter if xoffset=0
    je          .b16x16_sp_only

    shl         rax,        4                   ; 16 bytes per filter entry
    lea         rax,        [rax + rcx]         ; HFilter

    mov         rdi,        arg(4)              ; dst_ptr
    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm1,       [rax]               ; xmm1 = horizontal filter pair

    movsxd      rax,        dword ptr arg(3)    ; yoffset

    test        rax,        rax                 ; skip second_pass filter if yoffset=0
    je          .b16x16_fp_only

    shl         rax,        4                   ; 16 bytes per filter entry
    lea         rax,        [rax + rcx]         ; VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst + 16 * dst_pitch (loop end)
    movsxd      rdx,        dword ptr arg(1)    ; src_pixels_per_line

    movdqa      xmm2,       [rax]               ; xmm2 = vertical filter pair

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5)    ; dst_pitch
%endif
    ; Horizontally filter source row 0; the result is carried in xmm7.
    movq        xmm3,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16

    lea         rsi,        [rsi + rdx]         ; next line

    pmaddubsw   xmm3,       xmm1                ; filtered columns 0..7 (words)

    punpcklbw   xmm4,       xmm5                ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4,       xmm1                ; filtered columns 8..15 (words)

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    ; Horizontally filter the next source row into xmm6 ...
    movq        xmm6,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm5,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6,       xmm5
    movq        xmm4,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    movq        xmm5,       [rsi+9]             ; 09 10 11 12 13 14 15 16
    lea         rsi,        [rsi + rdx]         ; next line

    pmaddubsw   xmm6,       xmm1

    punpcklbw   xmm4,       xmm5
    pmaddubsw   xmm4,       xmm1

    paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

    paddw       xmm4,       [GLOBAL(rd)]        ; xmm4 += round value
    psraw       xmm4,       VP8_FILTER_SHIFT    ; xmm4 /= 128

    packuswb    xmm6,       xmm4
    movdqa      xmm5,       xmm7

    ; ... then vertically filter previous row (xmm7) against it (xmm6).
    punpcklbw   xmm5,       xmm6
    pmaddubsw   xmm5,       xmm2

    punpckhbw   xmm7,       xmm6
    pmaddubsw   xmm7,       xmm2

    paddw       xmm5,       [GLOBAL(rd)]        ; xmm5 += round value
    psraw       xmm5,       VP8_FILTER_SHIFT    ; xmm5 /= 128

    paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

    packuswb    xmm5,       xmm7
    movdqa      xmm7,       xmm6                ; current row becomes the previous row

    movdqa      [rdi],      xmm5                ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5)    ; dst_pitch
%else
    add         rdi,        r8
%endif

    cmp         rdi,        rcx
    jne         .next_row

    jmp         .done

; Vertical pass only (xoffset == 0).  Entered with
; rcx = &vp8_bilinear_filters_ssse3.  Produces two output rows per iteration.
; See the header for the xoffset == 0 && yoffset == 0 limitation.
.b16x16_sp_only:
    movsxd      rax,        dword ptr arg(3)    ; yoffset
    shl         rax,        4                   ; 16 bytes per filter entry
    lea         rax,        [rax + rcx]         ; VFilter

    mov         rdi,        arg(4)              ; dst_ptr
    mov         rsi,        arg(0)              ; src_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm1,       [rax]               ; VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst + 16 * dst_pitch (loop end)
    movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

    ; get the first horizontal line done
    movq        xmm4,       [rsi]               ; load row 0
    movq        xmm2,       [rsi + 8]           ; load row 0

    lea         rsi,        [rsi + rax]         ; next line
.next_row_sp:
    movq        xmm3,       [rsi]               ; load row + 1
    movq        xmm5,       [rsi + 8]           ; load row + 1

    punpcklbw   xmm4,       xmm3
    punpcklbw   xmm2,       xmm5

    pmaddubsw   xmm4,       xmm1
    movq        xmm7,       [rsi + rax]         ; load row + 2

    pmaddubsw   xmm2,       xmm1
    movq        xmm6,       [rsi + rax + 8]     ; load row + 2

    punpcklbw   xmm3,       xmm7
    punpcklbw   xmm5,       xmm6

    pmaddubsw   xmm3,       xmm1
    paddw       xmm4,       [GLOBAL(rd)]

    pmaddubsw   xmm5,       xmm1
    paddw       xmm2,       [GLOBAL(rd)]

    psraw       xmm4,       VP8_FILTER_SHIFT
    psraw       xmm2,       VP8_FILTER_SHIFT

    packuswb    xmm4,       xmm2
    paddw       xmm3,       [GLOBAL(rd)]

    movdqa      [rdi],      xmm4                ; store row 0
    paddw       xmm5,       [GLOBAL(rd)]

    psraw       xmm3,       VP8_FILTER_SHIFT
    psraw       xmm5,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm5
    movdqa      xmm4,       xmm7                ; row + 2 becomes row 0 of the next iteration

    movdqa      [rdi + rdx],xmm3                ; store row 1
    lea         rsi,        [rsi + 2*rax]

    movdqa      xmm2,       xmm6
    lea         rdi,        [rdi + 2*rdx]

    cmp         rdi,        rcx
    jne         .next_row_sp

    jmp         .done

; Horizontal pass only (yoffset == 0).  Entered with rsi = src, rdi = dst,
; rdx = dst_pitch and xmm1 = horizontal filter.  Two rows per iteration.
.b16x16_fp_only:
    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst + 16 * dst_pitch (loop end)
    movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line

.next_row_fp:
    movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
    movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2,       xmm4
    movq        xmm3,       [rsi+8]             ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2,       xmm1
    movq        xmm4,       [rsi+9]             ; 09 10 11 12 13 14 15 16

    lea         rsi,        [rsi + rax]         ; next line
    punpcklbw   xmm3,       xmm4

    pmaddubsw   xmm3,       xmm1
    movq        xmm5,       [rsi]

    paddw       xmm2,       [GLOBAL(rd)]
    movq        xmm7,       [rsi+1]

    movq        xmm6,       [rsi+8]
    psraw       xmm2,       VP8_FILTER_SHIFT

    punpcklbw   xmm5,       xmm7
    movq        xmm7,       [rsi+9]

    paddw       xmm3,       [GLOBAL(rd)]
    pmaddubsw   xmm5,       xmm1

    psraw       xmm3,       VP8_FILTER_SHIFT
    punpcklbw   xmm6,       xmm7

    packuswb    xmm2,       xmm3
    pmaddubsw   xmm6,       xmm1

    movdqa      [rdi],      xmm2                ; store the results in the destination
    paddw       xmm5,       [GLOBAL(rd)]

    lea         rdi,        [rdi + rdx]         ; dst_pitch
    psraw       xmm5,       VP8_FILTER_SHIFT

    paddw       xmm6,       [GLOBAL(rd)]
    psraw       xmm6,       VP8_FILTER_SHIFT

    packuswb    xmm5,       xmm6
    lea         rsi,        [rsi + rax]         ; next line

    movdqa      [rdi],      xmm5                ; store the results in the destination
    lea         rdi,        [rdi + rdx]         ; dst_pitch

    cmp         rdi,        rcx

    jne         .next_row_fp

.done:
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 8x8 bilinear prediction.  Path selection on xoffset / yoffset is the same
; as in the 16x16 routine, including the xoffset == 0 && yoffset == 0
; limitation described in the comments of the .b8x8_sp_only path.
;
; Nine source rows are first copied, 16 bytes each (unaligned loads), into a
; 144-byte aligned scratch area on the stack, so 16 bytes must be readable
; from the start of each of those rows.
; rsp itself is then used as the cursor into that scratch area: every path
; advances rsp by a total of 144 bytes before reaching .done8x8, where
; "pop rsp" picks up the value left on the stack by ALIGN_STACK
; (presumably the pre-alignment stack pointer - see x86_abi_support.asm).
; Destination rows are written with movq (no alignment requirement).
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                         ; reserve 144 bytes

    lea         rcx,        [GLOBAL(vp8_bilinear_filters_ssse3)]

    mov         rsi,        arg(0)      ;src_ptr
    movsxd      rdx,        dword ptr arg(1)    ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0,       [rsi]
    lea         rax,        [rdx + rdx*2]       ; rax = 3 * src stride
    movdqu      xmm1,       [rsi+rdx]
    movdqu      xmm2,       [rsi+rdx*2]
    add         rsi,        rax
    movdqu      xmm3,       [rsi]
    movdqu      xmm4,       [rsi+rdx]
    movdqu      xmm5,       [rsi+rdx*2]
    add         rsi,        rax
    movdqu      xmm6,       [rsi]
    movdqu      xmm7,       [rsi+rdx]

    movdqa      XMMWORD PTR [rsp],            xmm0

    movdqu      xmm0,       [rsi+rdx*2]         ; ninth row reuses xmm0

    movdqa      XMMWORD PTR [rsp+16],         xmm1
    movdqa      XMMWORD PTR [rsp+32],         xmm2
    movdqa      XMMWORD PTR [rsp+48],         xmm3
    movdqa      XMMWORD PTR [rsp+64],         xmm4
    movdqa      XMMWORD PTR [rsp+80],         xmm5
    movdqa      XMMWORD PTR [rsp+96],         xmm6
    movdqa      XMMWORD PTR [rsp+112],        xmm7
    movdqa      XMMWORD PTR [rsp+128],        xmm0

    movsxd      rax,        dword ptr arg(2)    ; xoffset
    test        rax,        rax                 ; skip first_pass filter if xoffset=0
    je          .b8x8_sp_only

    shl         rax,        4                   ; 16 bytes per filter entry
    add         rax,        rcx                 ; HFilter

    mov         rdi,        arg(4)              ; dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm0,       [rax]               ; xmm0 = horizontal filter pair

    movsxd      rax,        dword ptr arg(3)    ; yoffset
    test        rax,        rax                 ; skip second_pass filter if yoffset=0
    je          .b8x8_fp_only

    shl         rax,        4                   ; 16 bytes per filter entry
    lea         rax,        [rax + rcx]         ; VFilter

    lea         rcx,        [rdi+rdx*8]         ; rcx = dst + 8 * dst_pitch (loop end)

    movdqa      xmm1,       [rax]               ; xmm1 = vertical filter pair

    ; get the first horizontal line done
    movdqa      xmm3,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5,       xmm3                ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

    psrldq      xmm5,       1
    lea         rsp,        [rsp + 16]          ; next line

    punpcklbw   xmm3,       xmm5                ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3,       xmm0                ; filtered columns 0..7 (words)

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm7                ; previous row, 8 filtered bytes in each half

.next_row:
    movdqa      xmm6,       [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    lea         rsp,        [rsp + 16]          ; next line

    movdqa      xmm5,       xmm6

    psrldq      xmm5,       1

    punpcklbw   xmm6,       xmm5
    pmaddubsw   xmm6,       xmm0

    paddw       xmm6,       [GLOBAL(rd)]        ; xmm6 += round value
    psraw       xmm6,       VP8_FILTER_SHIFT    ; xmm6 /= 128

    packuswb    xmm6,       xmm6

    punpcklbw   xmm7,       xmm6                ; interleave previous / current row
    pmaddubsw   xmm7,       xmm1

    paddw       xmm7,       [GLOBAL(rd)]        ; xmm7 += round value
    psraw       xmm7,       VP8_FILTER_SHIFT    ; xmm7 /= 128

    packuswb    xmm7,       xmm7

    movq        [rdi],      xmm7                ; store the results in the destination
    lea         rdi,        [rdi + rdx]

    movdqa      xmm7,       xmm6                ; current row becomes the previous row

    cmp         rdi,        rcx
    jne         .next_row

    jmp         .done8x8

; Vertical pass only (xoffset == 0), fully unrolled over the 9 stacked rows.
; Entered with rcx = &vp8_bilinear_filters_ssse3.
.b8x8_sp_only:
    movsxd      rax,        dword ptr arg(3)    ; yoffset
    shl         rax,        4                   ; 16 bytes per filter entry
    lea         rax,        [rax + rcx]         ; VFilter

    mov         rdi,        arg(4)      ;dst_ptr
    movsxd      rdx,        dword ptr arg(5)    ; dst_pitch

    movdqa      xmm0,       [rax]               ; VFilter

    movq        xmm1,       XMMWORD PTR [rsp]
    movq        xmm2,       XMMWORD PTR [rsp+16]

    movq        xmm3,       XMMWORD PTR [rsp+32]
    punpcklbw   xmm1,       xmm2

    movq        xmm4,       XMMWORD PTR [rsp+48]
    punpcklbw   xmm2,       xmm3

    movq        xmm5,       XMMWORD PTR [rsp+64]
    punpcklbw   xmm3,       xmm4

    movq        xmm6,       XMMWORD PTR [rsp+80]
    punpcklbw   xmm4,       xmm5

    movq        xmm7,       XMMWORD PTR [rsp+96]
    punpcklbw   xmm5,       xmm6

    ; Because the source register (xmm0) is always treated as signed by
    ; pmaddubsw, the constant '128' is treated as '-128'.
    pmaddubsw   xmm1,       xmm0
    pmaddubsw   xmm2,       xmm0

    pmaddubsw   xmm3,       xmm0
    pmaddubsw   xmm4,       xmm0

    pmaddubsw   xmm5,       xmm0
    punpcklbw   xmm6,       xmm7

    pmaddubsw   xmm6,       xmm0
    paddw       xmm1,       [GLOBAL(rd)]

    paddw       xmm2,       [GLOBAL(rd)]
    psraw       xmm1,       VP8_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm2,       VP8_FILTER_SHIFT

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm3,       VP8_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    paddw       xmm6,       [GLOBAL(rd)]
    psraw       xmm5,       VP8_FILTER_SHIFT

    psraw       xmm6,       VP8_FILTER_SHIFT

    ; Having multiplied everything by '-128' and obtained negative
    ; numbers, the unsigned saturation truncates those values to 0,
    ; resulting in incorrect handling of xoffset == 0 && yoffset == 0
    packuswb    xmm1,       xmm1

    packuswb    xmm2,       xmm2
    movq        [rdi],      xmm1

    packuswb    xmm3,       xmm3
    movq        [rdi+rdx],  xmm2

    packuswb    xmm4,       xmm4
    movq        xmm1,       XMMWORD PTR [rsp+112]

    lea         rdi,        [rdi + 2*rdx]
    movq        xmm2,       XMMWORD PTR [rsp+128]

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm3

    packuswb    xmm6,       xmm6
    movq        [rdi+rdx],  xmm4

    lea         rdi,        [rdi + 2*rdx]
    punpcklbw   xmm7,       xmm1

    movq        [rdi],      xmm5
    pmaddubsw   xmm7,       xmm0

    movq        [rdi+rdx],  xmm6
    punpcklbw   xmm1,       xmm2

    pmaddubsw   xmm1,       xmm0
    paddw       xmm7,       [GLOBAL(rd)]

    psraw       xmm7,       VP8_FILTER_SHIFT
    paddw       xmm1,       [GLOBAL(rd)]

    psraw       xmm1,       VP8_FILTER_SHIFT
    packuswb    xmm7,       xmm7

    packuswb    xmm1,       xmm1
    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],      xmm7

    movq        [rdi+rdx],  xmm1
    lea         rsp,        [rsp + 144]         ; release the whole scratch area

    jmp         .done8x8

; Horizontal pass only (yoffset == 0).  Entered with rdi = dst,
; rdx = dst_pitch and xmm0 = horizontal filter.  Four rows per iteration.
.b8x8_fp_only:
    lea         rcx,        [rdi+rdx*8]         ; rcx = dst + 8 * dst_pitch (loop end)

.next_row_fp:
    movdqa      xmm1,       XMMWORD PTR [rsp]
    movdqa      xmm3,       XMMWORD PTR [rsp+16]

    movdqa      xmm2,       xmm1
    movdqa      xmm5,       XMMWORD PTR [rsp+32]

    psrldq      xmm2,       1
    movdqa      xmm7,       XMMWORD PTR [rsp+48]

    movdqa      xmm4,       xmm3
    psrldq      xmm4,       1

    movdqa      xmm6,       xmm5
    psrldq      xmm6,       1

    punpcklbw   xmm1,       xmm2
    pmaddubsw   xmm1,       xmm0

    punpcklbw   xmm3,       xmm4
    pmaddubsw   xmm3,       xmm0

    punpcklbw   xmm5,       xmm6
    pmaddubsw   xmm5,       xmm0

    movdqa      xmm2,       xmm7
    psrldq      xmm2,       1

    punpcklbw   xmm7,       xmm2
    pmaddubsw   xmm7,       xmm0

    paddw       xmm1,       [GLOBAL(rd)]
    psraw       xmm1,       VP8_FILTER_SHIFT

    paddw       xmm3,       [GLOBAL(rd)]
    psraw       xmm3,       VP8_FILTER_SHIFT

    paddw       xmm5,       [GLOBAL(rd)]
    psraw       xmm5,       VP8_FILTER_SHIFT

    paddw       xmm7,       [GLOBAL(rd)]
    psraw       xmm7,       VP8_FILTER_SHIFT

    packuswb    xmm1,       xmm1
    packuswb    xmm3,       xmm3

    packuswb    xmm5,       xmm5
    movq        [rdi],      xmm1

    packuswb    xmm7,       xmm7
    movq        [rdi+rdx],  xmm3

    lea         rdi,        [rdi + 2*rdx]
    movq        [rdi],      xmm5

    lea         rsp,        [rsp + 4*16]        ; four stacked rows consumed
    movq        [rdi+rdx],  xmm7

    lea         rdi,        [rdi + 2*rdx]
    cmp         rdi,        rcx

    jne         .next_row_fp

    lea         rsp,        [rsp + 16]          ; skip the unused ninth row

.done8x8:
    ;add rsp, 144
    pop         rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Shuffle masks for data loaded as 16 consecutive bytes starting at
; src - 2 (byte i = pixel i - 2); used by the 4-wide horizontal filter.
align 16
shuf1b:
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

; Shuffle masks for data that is already interleaved in the shuf1b order
; (the punpcklbw of [src - 2] and [src + 3]); used by the 8- and 16-wide
; horizontal filters.
align 16
shuf2bfrom1:
    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
shuf3bfrom1:
    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

; Rounding constant: 64 = 1 << (VP8_FILTER_SHIFT - 1), one word per lane.
align 16
rd:
    times 8 dw 0x40

; Six-tap coefficient tables: 8 entries of 16 bytes each (8 byte pairs).
; k1_k3 and k2_k4 must stay K1_K3_OFFSET / K2_K4_OFFSET bytes after k0_k5.
align 16
k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
k1_k3:
    times 8 db  0,    0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
k2_k4:
    times 8 db 128,    0        ;placeholder
    times 8 db 123,   -1
    times 8 db 108,   -8
    times 8 db  93,   -6
    times 8 db  77,  -16
    times 8 db  50,   -9
    times 8 db  36,  -11
    times 8 db  12,   -6

; Bilinear filter pairs, 8 entries of 16 bytes each.  Entry 0 (128, 0) is not
; usable through pmaddubsw because 128 is read as the signed byte -128.
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96,  32
    times 8 db 80,  48
    times 8 db 64,  64
    times 8 db 48,  80
    times 8 db 32,  96
    times 8 db 16,  112
