;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;
; MMX sub-pixel prediction kernels: three six-tap filter passes and three
; bilinear predictors, followed by the coefficient tables they read.
;
; Syntax: Intel operand order (dst, src). Argument access, symbol naming,
; GOT handling and the prolog/epilog helpers (arg, sym, GLOBAL, GET_GOT,
; SHADOW_ARGS_TO_STACK, ...) all come from the include below, so the same
; text serves both the 32-bit and the 64-bit builds.
;
; NOTE(review): no function here executes emms before returning --
; presumably callers reset the MMX/x87 state themselves; confirm.
;

%include "vpx_ports/x86_abi_support.asm"


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7                 ; results are scaled by >> 7 (/128)


;-----------------------------------------------------------------------
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal six-tap pass. Each iteration produces four outputs:
;     out[x] = clamp8((sum(k=0..5) src[x + k - 2] * tap[k] + 64) >> 7)
; and stores them as four 16-bit words (8 bytes) at output_ptr.
;
; vp8_filter : six taps, one every 16 bytes; the low four words of each
;              are used (presumably a row of vp8_six_tap_mmx -- verify
;              against the caller).
; output_width is added to output_ptr as a byte count after every row.
; pixel_step (arg 3) is never read.
; Every row reads source bytes [src - 2, src + 6].
;
; Registers: rsi = src, rdi = dst, rcx = rows left, rax = dst step,
;            rdx = filter, mm0 = 0, mm1/mm2/mm6/mm7 = taps 1/2/3/4.
;-----------------------------------------------------------------------
global sym(vp8_filter_block1d_h6_mmx)
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx, arg(6)                 ;vp8_filter

    ; keep the four inner taps in registers; taps 0 and 5 are applied
    ; straight from memory inside the loop
    movq        mm1, [rdx + 16]             ; mm1 = tap 1
    movq        mm2, [rdx + 32]             ; mm2 = tap 2
    movq        mm6, [rdx + 48]             ; mm6 = tap 3
    movq        mm7, [rdx + 64]             ; mm7 = tap 4

    mov         rdi, arg(1)                 ;output_ptr
    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;output_height
    movsxd      rax, dword ptr arg(5)       ;output_width (used as the destination step)
    pxor        mm0, mm0                    ; mm0 = 0, for byte -> word unpacking

nextrow:
    movq        mm3, [rsi-2]                ; mm3 = p-2..p5
    movq        mm4, mm3                    ; mm4 = p-2..p5
    psrlq       mm3, 8                      ; mm3 = p-1..p5
    punpcklbw   mm3, mm0                    ; mm3 = p-1..p2 as words
    pmullw      mm3, mm1                    ; mm3 = p-1..p2 * tap 1

    movq        mm5, mm4                    ; mm5 = p-2..p5
    punpckhbw   mm4, mm0                    ; mm4 = p2..p5 as words
    pmullw      mm4, mm7                    ; mm4 *= tap 4
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, mm5                    ; mm4 = p-2..p5
    psrlq       mm5, 16                     ; mm5 = p0..p5
    punpcklbw   mm5, mm0                    ; mm5 = p0..p3 as words
    pmullw      mm5, mm2                    ; mm5 *= tap 2
    paddsw      mm3, mm5                    ; mm3 += mm5

    movq        mm5, mm4                    ; mm5 = p-2..p5
    psrlq       mm4, 24                     ; mm4 = p1..p5
    punpcklbw   mm4, mm0                    ; mm4 = p1..p4 as words
    pmullw      mm4, mm6                    ; mm4 *= tap 3
    paddsw      mm3, mm4                    ; mm3 += mm4

    ; outer taps
    movd        mm4, [rsi+3]                ; mm4 = p3..p6
    punpcklbw   mm4, mm0                    ; mm4 = p3..p6 as words
    pmullw      mm4, [rdx+80]               ; mm4 *= tap 5
    paddsw      mm3, mm4                    ; mm3 += mm4

    punpcklbw   mm5, mm0                    ; mm5 = p-2..p1 as words
    pmullw      mm5, [rdx]                  ; mm5 *= tap 0
    paddsw      mm3, mm5                    ; mm3 += mm5

    paddsw      mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; saturate to 0..255 ...
    punpcklbw   mm3, mm0                    ; ... and widen back to words

    movq        [rdi], mm3                  ; store four 16-bit results

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(2)       ;src_pixels_per_line ; next source row
    add         rdi, rax                    ; next destination row
%else
    movsxd      r8, dword ptr arg(2)        ;src_pixels_per_line
    add         rdi, rax                    ; next destination row

    add         rsi, r8                     ; next source row
%endif

    dec         rcx                         ; one row done
    jnz         nextrow                     ; loop while rows remain

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;
; THIS FUNCTION APPEARS TO BE UNUSED
;
;void vp8_filter_block1d_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical six-tap pass over 16-bit input. Each iteration combines the
; four words of six consecutive rows (row -2 .. row 3 relative to the
; current row), adds 64, shifts right by 7, saturates to bytes and stores
; four bytes at output_ptr.
;
; pixels_per_line is added to the byte address of src_ptr, so it is a
; byte step between rows, not an element count.
; output_width is added to output_ptr after every row.
; pixel_step (arg 3) is never read.
;
; Registers: rsi = src - 2 rows, rdx = source step, rdi = dst,
;            rax = dst step, rcx = rows left, rbx = filter,
;            mm0 = 0, mm5 = rounding constant,
;            mm1/mm2/mm6/mm7 = taps 1/2/3/4.
;-----------------------------------------------------------------------
global sym(vp8_filter_block1d_v6_mmx)
sym(vp8_filter_block1d_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; The rounding constant is fetched before rbx is reused below
    ; (presumably because GLOBAL() may depend on the register handed to
    ; GET_GOT -- confirm in x86_abi_support.asm).
    movq        mm5, [GLOBAL(rd)]
    push        rbx                         ; rbx becomes the filter pointer
    mov         rbx, arg(6)                 ;vp8_filter
    movq        mm1, [rbx + 16]             ; mm1 = tap 1
    movq        mm2, [rbx + 32]             ; mm2 = tap 2
    movq        mm6, [rbx + 48]             ; mm6 = tap 3
    movq        mm7, [rbx + 64]             ; mm7 = tap 4

    movsxd      rdx, dword ptr arg(2)       ;pixels_per_line
    mov         rdi, arg(1)                 ;output_ptr
    mov         rsi, arg(0)                 ;src_ptr
    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi = two rows above src_ptr
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    movsxd      rax, DWORD PTR arg(5)       ;output_width (used as the destination step)
    pxor        mm0, mm0                    ; mm0 = 0


nextrow_v:
    movq        mm3, [rsi+rdx]              ; mm3 = p0..p3 = row -1
    pmullw      mm3, mm1                    ; mm3 *= tap 1


    movq        mm4, [rsi + 4*rdx]          ; mm4 = p0..p3 = row  2
    pmullw      mm4, mm7                    ; mm4 *= tap 4
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 2*rdx]          ; mm4 = p0..p3 = row  0
    pmullw      mm4, mm2                    ; mm4 *= tap 2
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi]                  ; mm4 = p0..p3 = row -2
    pmullw      mm4, [rbx]                  ; mm4 *= tap 0
    paddsw      mm3, mm4                    ; mm3 += mm4


    add         rsi, rdx                    ; advance one row; also avoids a 3*step address
    movq        mm4, [rsi + 2*rdx]          ; mm4 = p0..p3 = row  1
    pmullw      mm4, mm6                    ; mm4 *= tap 3
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 4*rdx]          ; mm4 = p0..p3 = row  3
    pmullw      mm4, [rbx +80]              ; mm4 *= tap 5
    paddsw      mm3, mm4                    ; mm3 += mm4


    paddsw      mm3, mm5                    ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; saturate to 0..255

    movd        [rdi],mm3                   ; store four result bytes

    add         rdi,rax;                    ; next destination row

    dec         rcx                         ; one row done
    jnz         nextrow_v                   ; loop while rows remain

    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;    int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Same vertical six-tap pass as vp8_filter_block1d_v6_mmx, but with an
; explicit destination pitch: output_ptr advances by output_pitch after
; every row.
;
; pixels_per_line is added to the byte address of src_ptr (byte step).
; pixel_step (arg 4) and output_width (arg 6) are never read.
;
; Registers: rsi = src - 2 rows, rdx = source step, rdi = dst,
;            rax = output_pitch, rcx = rows left, rbx = filter,
;            mm0 = 0, mm5 = rounding constant,
;            mm1/mm2/mm6/mm7 = taps 1/2/3/4.
;-----------------------------------------------------------------------
global sym(vp8_filter_block1dc_v6_mmx)
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; fetch the rounding constant before rbx is reused as the filter
    ; pointer (see the hedge in the review notes: GLOBAL() may need the
    ; GET_GOT register -- confirm)
    movq        mm5, [GLOBAL(rd)]
    push        rbx                         ; rbx becomes the filter pointer
    mov         rbx, arg(7)                 ;vp8_filter
    movq        mm1, [rbx + 16]             ; mm1 = tap 1
    movq        mm2, [rbx + 32]             ; mm2 = tap 2
    movq        mm6, [rbx + 48]             ; mm6 = tap 3
    movq        mm7, [rbx + 64]             ; mm7 = tap 4

    movsxd      rdx, dword ptr arg(3)       ;pixels_per_line
    mov         rdi, arg(1)                 ;output_ptr
    mov         rsi, arg(0)                 ;src_ptr
    sub         rsi, rdx
    sub         rsi, rdx                    ; rsi = two rows above src_ptr
    movsxd      rcx, DWORD PTR arg(5)       ;output_height
    movsxd      rax, DWORD PTR arg(2)       ;output_pitch
    pxor        mm0, mm0                    ; mm0 = 0


nextrow_cv:
    movq        mm3, [rsi+rdx]              ; mm3 = p0..p3 = row -1
    pmullw      mm3, mm1                    ; mm3 *= tap 1


    movq        mm4, [rsi + 4*rdx]          ; mm4 = p0..p3 = row  2
    pmullw      mm4, mm7                    ; mm4 *= tap 4
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 2*rdx]          ; mm4 = p0..p3 = row  0
    pmullw      mm4, mm2                    ; mm4 *= tap 2
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi]                  ; mm4 = p0..p3 = row -2
    pmullw      mm4, [rbx]                  ; mm4 *= tap 0
    paddsw      mm3, mm4                    ; mm3 += mm4


    add         rsi, rdx                    ; advance one row; also avoids a 3*step address
    movq        mm4, [rsi + 2*rdx]          ; mm4 = p0..p3 = row  1
    pmullw      mm4, mm6                    ; mm4 *= tap 3
    paddsw      mm3, mm4                    ; mm3 += mm4

    movq        mm4, [rsi + 4*rdx]          ; mm4 = p0..p3 = row  3
    pmullw      mm4, [rbx +80]              ; mm4 *= tap 5
    paddsw      mm3, mm4                    ; mm3 += mm4


    paddsw      mm3, mm5                    ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7 (arithmetic)
    packuswb    mm3, mm0                    ; saturate to 0..255

    movd        [rdi],mm3                   ; store four result bytes
    ; Each iteration re-reads five of the six source rows that the
    ; previous iteration loaded. The data should still be in cache, so
    ; this is cheap, but it is avoidable.
    lea         rdi,  [rdi+rax]             ; next destination row
    dec         rcx                         ; one row done
    jnz         nextrow_cv                  ; loop while rows remain

    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of an 8-wide, 8-row block.
;   first pass  : h[x] = (src[x]*HFilter[0] + src[x+1]*HFilter[1] + 64) >> 7
;   second pass : dst  = (h_prev*VFilter[0] + h_cur*VFilter[1] + 64) >> 7
; HFilter / VFilter are the 32-byte entries xoffset / yoffset of
; vp8_bilinear_filters_mmx. Both passes are fused: the previous row's
; first-pass result is kept packed to bytes in mm7.
;
; Reads 9 source rows of 9 bytes each; writes 8 rows of 8 bytes.
;
; Registers: rsi = src, rdx = source pitch, rdi = dst,
;            rcx = dst + 8*dst_pitch (loop end), rax = VFilter,
;            mm0 = 0, mm1/mm2 = HFilter[0]/HFilter[1], mm7 = previous row.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x8_mmx)
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ;xoffset
    mov         rdi, arg(4)                 ;dst_ptr

    shl         rax, 5                      ; xoffset * 32 bytes per table entry
    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]

    add         rax, rcx                    ; rax = HFilter
    mov         rsi, arg(0)                 ;src_ptr

    movsxd      rdx, dword ptr arg(5)       ;dst_pitch
    movq        mm1, [rax]                  ; mm1 = HFilter[0]

    movq        mm2, [rax+16]               ; mm2 = HFilter[1]
    movsxd      rax, dword ptr arg(3)       ;yoffset

    pxor        mm0, mm0                    ; mm0 = 0

    shl         rax, 5                      ; yoffset * 32 bytes per table entry
    add         rax, rcx                    ; rax = VFilter

    lea         rcx, [rdi+rdx*8]            ; rcx = dst_ptr + 8 * dst_pitch (loop end)
    movsxd      rdx, dword ptr arg(1)       ;src_pixels_per_line



    ; first pass for the first source row; it only primes mm7
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] as words
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] as words

    pmullw      mm3, mm1                    ; * HFilter[0]
    pmullw      mm4, mm1                    ;

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5                    ;

    punpcklbw   mm5, mm0                    ; mm5 = src[1..4] as words
    punpckhbw   mm6, mm0                    ; mm6 = src[5..8] as words

    pmullw      mm5, mm2                    ; * HFilter[1]
    pmullw      mm6, mm2                    ;

    paddw       mm3, mm5                    ;
    paddw       mm4, mm6                    ;

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]           ;
    psraw       mm4, VP8_FILTER_SHIFT       ;

    movq        mm7, mm3                    ;
    packuswb    mm7, mm4                    ; mm7 = first-pass row, packed to bytes

    add         rsi, rdx                    ; next source row
next_row_8x8:
    ; first pass for the current source row
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] as words
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] as words

    pmullw      mm3, mm1                    ; * HFilter[0]
    pmullw      mm4, mm1                    ;

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5                    ;

    punpcklbw   mm5, mm0                    ; mm5 = src[1..4] as words
    punpckhbw   mm6, mm0                    ; mm6 = src[5..8] as words

    pmullw      mm5, mm2                    ; * HFilter[1]
    pmullw      mm6, mm2                    ;

    paddw       mm3, mm5                    ;
    paddw       mm4, mm6                    ;

    ; previous row's first-pass result, weighted by VFilter[0]
    movq        mm5, mm7                    ;
    movq        mm6, mm7                    ;

    punpcklbw   mm5, mm0                    ;
    punpckhbw   mm6, mm0

    pmullw      mm5, [rax]                  ; * VFilter[0]
    pmullw      mm6, [rax]                  ;

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]           ;
    psraw       mm4, VP8_FILTER_SHIFT       ;

    movq        mm7, mm3                    ;
    packuswb    mm7, mm4                    ; keep this row for the next iteration


    pmullw      mm3, [rax+16]               ; current row * VFilter[1]
    pmullw      mm4, [rax+16]               ;

    paddw       mm3, mm5                    ;
    paddw       mm4, mm6                    ;


    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]           ;
    psraw       mm4, VP8_FILTER_SHIFT       ;

    packuswb    mm3, mm4                    ; saturate to 0..255

    movq        [rdi], mm3                  ; store eight result bytes

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source row
    add         rdi, dword ptr arg(5)       ;dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ;dst_pitch
    add         rsi, rdx                    ; next source row
    add         rdi, r8                     ; next destination row
%endif
    cmp         rdi, rcx                    ; reached dst_ptr + 8 * dst_pitch?
    jne         next_row_8x8

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of an 8-wide, 4-row block. Identical to
; the 8x8 routine except that the loop ends at dst_ptr + 4 * dst_pitch.
;   first pass  : h[x] = (src[x]*HFilter[0] + src[x+1]*HFilter[1] + 64) >> 7
;   second pass : dst  = (h_prev*VFilter[0] + h_cur*VFilter[1] + 64) >> 7
;
; Reads 5 source rows of 9 bytes each; writes 4 rows of 8 bytes.
;
; Registers: rsi = src, rdx = source pitch, rdi = dst,
;            rcx = dst + 4*dst_pitch (loop end), rax = VFilter,
;            mm0 = 0, mm1/mm2 = HFilter[0]/HFilter[1], mm7 = previous row.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict8x4_mmx)
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ;xoffset
    mov         rdi, arg(4)                 ;dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    shl         rax, 5                      ; xoffset * 32 bytes per table entry

    mov         rsi, arg(0)                 ;src_ptr
    add         rax, rcx                    ; rax = HFilter

    movsxd      rdx, dword ptr arg(5)       ;dst_pitch
    movq        mm1, [rax]                  ; mm1 = HFilter[0]

    movq        mm2, [rax+16]               ; mm2 = HFilter[1]
    movsxd      rax, dword ptr arg(3)       ;yoffset

    pxor        mm0, mm0                    ; mm0 = 0
    shl         rax, 5                      ; yoffset * 32 bytes per table entry

    add         rax, rcx                    ; rax = VFilter
    lea         rcx, [rdi+rdx*4]            ; rcx = dst_ptr + 4 * dst_pitch (loop end)

    movsxd      rdx, dword ptr arg(1)       ;src_pixels_per_line

    ; first pass for the first source row; it only primes mm7
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] as words
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] as words

    pmullw      mm3, mm1                    ; * HFilter[0]
    pmullw      mm4, mm1                    ;

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5                    ;

    punpcklbw   mm5, mm0                    ; mm5 = src[1..4] as words
    punpckhbw   mm6, mm0                    ; mm6 = src[5..8] as words

    pmullw      mm5, mm2                    ; * HFilter[1]
    pmullw      mm6, mm2                    ;

    paddw       mm3, mm5                    ;
    paddw       mm4, mm6                    ;

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]           ;
    psraw       mm4, VP8_FILTER_SHIFT       ;

    movq        mm7, mm3                    ;
    packuswb    mm7, mm4                    ; mm7 = first-pass row, packed to bytes

    add         rsi, rdx                    ; next source row
next_row_8x4:
    ; first pass for the current source row
    movq        mm3, [rsi]                  ; mm3 = src[0..7]
    movq        mm4, mm3                    ; copy for the high half

    punpcklbw   mm3, mm0                    ; mm3 = src[0..3] as words
    punpckhbw   mm4, mm0                    ; mm4 = src[4..7] as words

    pmullw      mm3, mm1                    ; * HFilter[0]
    pmullw      mm4, mm1                    ;

    movq        mm5, [rsi+1]                ; mm5 = src[1..8]
    movq        mm6, mm5                    ;

    punpcklbw   mm5, mm0                    ; mm5 = src[1..4] as words
    punpckhbw   mm6, mm0                    ; mm6 = src[5..8] as words

    pmullw      mm5, mm2                    ; * HFilter[1]
    pmullw      mm6, mm2                    ;

    paddw       mm3, mm5                    ;
    paddw       mm4, mm6                    ;

    ; previous row's first-pass result, weighted by VFilter[0]
    movq        mm5, mm7                    ;
    movq        mm6, mm7                    ;

    punpcklbw   mm5, mm0                    ;
    punpckhbw   mm6, mm0

    pmullw      mm5, [rax]                  ; * VFilter[0]
    pmullw      mm6, [rax]                  ;

    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]           ;
    psraw       mm4, VP8_FILTER_SHIFT       ;

    movq        mm7, mm3                    ;
    packuswb    mm7, mm4                    ; keep this row for the next iteration


    pmullw      mm3, [rax+16]               ; current row * VFilter[1]
    pmullw      mm4, [rax+16]               ;

    paddw       mm3, mm5                    ;
    paddw       mm4, mm6                    ;


    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    paddw       mm4, [GLOBAL(rd)]           ;
    psraw       mm4, VP8_FILTER_SHIFT       ;

    packuswb    mm3, mm4                    ; saturate to 0..255

    movq        [rdi], mm3                  ; store eight result bytes

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source row
    add         rdi, dword ptr arg(5)       ;dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ;dst_pitch
    add         rsi, rdx                    ; next source row
    add         rdi, r8                     ; next destination row
%endif
    cmp         rdi, rcx                    ; reached dst_ptr + 4 * dst_pitch?
    jne         next_row_8x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of a 4-wide, 4-row block.
;   first pass  : h[x] = (src[x]*HFilter[0] + src[x+1]*HFilter[1] + 64) >> 7
;   second pass : dst  = (h_prev*VFilter[0] + h_cur*VFilter[1] + 64) >> 7
; Only the low four words of each filter tap are used.
;
; Reads 5 source rows of 5 bytes each; writes 4 rows of 4 bytes.
;
; Registers: rsi = src, rdx = source pitch, rdi = dst,
;            rcx = dst + 4*dst_pitch (loop end), rax = VFilter,
;            mm0 = 0, mm1/mm2 = HFilter[0]/HFilter[1], mm7 = previous row.
;-----------------------------------------------------------------------
global sym(vp8_bilinear_predict4x4_mmx)
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = bilinear_filters_mmx[xoffset];
    ;const short *VFilter = bilinear_filters_mmx[yoffset];

    movsxd      rax, dword ptr arg(2)       ;xoffset
    mov         rdi, arg(4)                 ;dst_ptr

    lea         rcx, [GLOBAL(sym(vp8_bilinear_filters_mmx))]
    shl         rax, 5                      ; xoffset * 32 bytes per table entry

    add         rax, rcx                    ; rax = HFilter
    mov         rsi, arg(0)                 ;src_ptr

    movsxd      rdx, dword ptr arg(5)       ;dst_pitch
    movq        mm1, [rax]                  ; mm1 = HFilter[0]

    movq        mm2, [rax+16]               ; mm2 = HFilter[1]
    movsxd      rax, dword ptr arg(3)       ;yoffset

    pxor        mm0, mm0                    ; mm0 = 0
    shl         rax, 5                      ; yoffset * 32 bytes per table entry

    add         rax, rcx                    ; rax = VFilter
    lea         rcx, [rdi+rdx*4]            ; rcx = dst_ptr + 4 * dst_pitch (loop end)

    movsxd      rdx, dword ptr arg(1)       ;src_pixels_per_line

    ; first pass for the first source row; it only primes mm7
    movd        mm3, [rsi]                  ; mm3 = src[0..3]
    punpcklbw   mm3, mm0                    ; ... as words

    pmullw      mm3, mm1                    ; * HFilter[0]
    movd        mm5, [rsi+1]                ; mm5 = src[1..4]

    punpcklbw   mm5, mm0                    ; ... as words
    pmullw      mm5, mm2                    ; * HFilter[1]

    paddw       mm3, mm5                    ;
    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)

    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    movq        mm7, mm3                    ;
    packuswb    mm7, mm0                    ; mm7 = first-pass row, packed to bytes

    add         rsi, rdx                    ; next source row
next_row_4x4:
    ; first pass for the current source row
    movd        mm3, [rsi]                  ; mm3 = src[0..3]
    punpcklbw   mm3, mm0                    ; ... as words

    pmullw      mm3, mm1                    ; * HFilter[0]
    movd        mm5, [rsi+1]                ; mm5 = src[1..4]

    punpcklbw   mm5, mm0                    ; ... as words
    pmullw      mm5, mm2                    ; * HFilter[1]

    paddw       mm3, mm5                    ;

    ; previous row's first-pass result, weighted by VFilter[0]
    movq        mm5, mm7                    ;
    punpcklbw   mm5, mm0                    ;

    pmullw      mm5, [rax]                  ; * VFilter[0]
    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)

    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7
    movq        mm7, mm3                    ;

    packuswb    mm7, mm0                    ; keep this row for the next iteration

    pmullw      mm3, [rax+16]               ; current row * VFilter[1]
    paddw       mm3, mm5                    ;


    paddw       mm3, [GLOBAL(rd)]           ; mm3 += rounding constant (64)
    psraw       mm3, VP8_FILTER_SHIFT       ; mm3 >>= 7

    packuswb    mm3, mm0                    ; saturate to 0..255
    movd        [rdi], mm3                  ; store four result bytes

%if ABI_IS_32BIT
    add         rsi, rdx                    ; next source row
    add         rdi, dword ptr arg(5)       ;dst_pitch
%else
    movsxd      r8, dword ptr arg(5)        ;dst_pitch
    add         rsi, rdx                    ; next source row
    add         rdi, r8                     ; next destination row
%endif

    cmp         rdi, rcx                    ; reached dst_ptr + 4 * dst_pitch?
    jne         next_row_4x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
; Rounding constant added before the >> VP8_FILTER_SHIFT: 0x40 = 64, half
; of 1 << 7. Four words, i.e. one 8-byte MMX operand.
align 16
rd:
    times 4 dw 0x40

; Six-tap coefficient table: 8 filters x 6 taps. Every tap is replicated
; into eight words (16 bytes), so a filter occupies 96 bytes and its taps
; sit at byte offsets 0, 16, 32, 48, 64 and 80 -- the offsets the six-tap
; routines above read through their vp8_filter argument. The taps of each
; filter sum to 128.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0


; Bilinear coefficient table: 8 entries x 2 taps, every tap replicated
; into eight words (16 bytes), so an entry is 32 bytes -- hence the
; "shl rax, 5" indexing in the bilinear routines above. The two taps of
; each entry sum to 128.
align 16
global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
sym(vp8_bilinear_filters_mmx):
    times 8 dw 128
    times 8 dw 0

    times 8 dw 112
    times 8 dw 16

    times 8 dw 96
    times 8 dw 32

    times 8 dw 80
    times 8 dw 48

    times 8 dw 64
    times 8 dw 64

    times 8 dw 48
    times 8 dw 80

    times 8 dw 32
    times 8 dw 96

    times 8 dw 16
    times 8 dw 112