;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

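; For reference, the functions below all implement the same bilinear chroma
; interpolation.  With x = mx and y = my (each 0..7), the weights reduce to
;     A = (8-x)*(8-y)   B = x*(8-y)   C = (8-x)*y   D = x*y
;     2D: dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride]
;                   + D*src[i+stride+1] + rnd2) >> 6
;     1D: dst[i] = ((8-k)*src[i] + k*src[i+d] + rnd1) >> 3
;         with k = x or y and d = 1 or stride
; where rnd1/rnd2 are 4/32 for H.264 and 3/28 for VC-1, while RV40 adds a
; per-(mx,my) bias taken from the rnd_rv40_*_tbl tables above.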
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec          r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd          m4, r4d       ; x
    movd          m6, r5d       ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp       ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16        ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4        ; mm4 = x words
    punpckldq     m6, m6        ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6        ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4        ; DD = x * y
    psubw         m5, m4        ; mm5 = B = 8x - xy
    psubw         m6, m4        ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7        ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4

    movq          m0, [r1  ]    ; mm0 = src[0..7]
    movq          m1, [r1+1]    ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1        ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0        ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1        ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4        ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2        ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6        ; restore stack pointer
    RET
%endmacro

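; The 4-wide version below filters each source row horizontally only once:
; the result is kept in a register and reused as the "top" row of the
; vertical filter (weights 8-y and y) for the following output row, so every
; pass through .next2rows produces two output rows.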
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor          m7, m7
    movd          m2, r4d       ; x
    movd          m3, r5d       ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro

%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d       ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d       ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5        ; mm5 = {A,B,A,B}
    punpckldq     m6, m6        ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94  ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5        ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94  ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0        ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

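; The SSSE3 versions fold both filter taps of a row into a single pmaddubsw:
; the two byte coefficients are packed into one word (e.g. (8-y)*x << 8 |
; (8-y)*(8-x) for the top row) and splatted across the register, while
; adjacent source pixels are interleaved with punpcklbw, so each word lane of
; the pmaddubsw result is coef_lo*src[i] + coef_hi*src[i+1].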
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8         ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4        ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d       ; y *(x*255+8) = y *x<<8 | y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6

.next2rows:
    movq          m1, [r1+r2*1  ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5        ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1   ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4        ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8         ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d       ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d       ; y *(x*255+8) = y *x<<8 | y *(8-x)

    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0   ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

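; Instantiate the SSSE3 put/avg variants; CHROMAMC_AVG is a no-op for put and
; a PAVGB average with the existing destination pixels for avg.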
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264