;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

cextern pw_1023
%define pw_pixel_max pw_1023
cextern pd_32

;-----------------------------------------------------------------------------
; void ff_h264_idct_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]
    movhps      %3, [%5+%6]
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro

%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

;dst, in, stride
%macro IDCT4_ADD_10 3
    mova      m0, [%2+ 0]
    mova      m1, [%2+16]
    mova      m2, [%2+32]
    mova      m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor      m5, m5
    mova [%2+ 0], m5
    mova [%2+16], m5
    mova [%2+32], m5
    mova [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    movsxdifnidn r2, r2d
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16_10(pixel *dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add       r5, r0
    mova      m0, [r2+ 0]
    mova      m1, [r2+16]
    mova      m2, [r2+32]
    mova      m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd     m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor      m5, m5
    mova [r2+ 0], m5
    mova [r2+16], m5
    mova [r2+32], m5
    mova [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea       r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

%macro ADD16_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
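    ; r1 = block_offset[]: pick up this 4x4 block's offset into dst for add4x4_idct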
    mov       r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add       r2, 64
%endif
%endmacro

%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    movsxdifnidn r3, r3d
    ADD16_OP 0, 4+1*8
    ADD16_OP 1, 5+1*8
    ADD16_OP 2, 4+2*8
    ADD16_OP 3, 5+2*8
    ADD16_OP 4, 6+1*8
    ADD16_OP 5, 7+1*8
    ADD16_OP 6, 6+2*8
    ADD16_OP 7, 7+2*8
    ADD16_OP 8, 4+3*8
    ADD16_OP 9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movsxdifnidn r2, r2d
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

;-----------------------------------------------------------------------------
; void ff_h264_idct8_dc_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movsxdifnidn r2, r2d
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add16intra_10(pixel *dst, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro AC 1
.ac%1:
    mov       r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov       r5d, [r1+(%1+1)*4]
    add       r2, 64
    call add4x4_idct %+ SUFFIX
    add       r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 2
    cmp       word [r4+%2], 0
    jnz .ac%1
    mov       r5d, [r2+ 0]
    or        r5d, [r2+64]
    jz .skipblock%1
    mov       r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add       r2, 128
%endif
.skipadd%1:
%endmacro

%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    mov dword [r2+ 0], 0
    mov dword [r2+64], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
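    ; m0 = {dc0 x4, dc1 x4}: add the two DC values to an 8x4 pixel area and clip to [0, pw_pixel_max]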
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

cglobal h264_idct_add16intra_10,5,7,8
    movsxdifnidn r3, r3d
    ADD16_OP_INTRA 0, 4+1*8
    ADD16_OP_INTRA 2, 4+2*8
    ADD16_OP_INTRA 4, 6+1*8
    ADD16_OP_INTRA 6, 6+2*8
    ADD16_OP_INTRA 8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC 8
    AC 10
    AC 12
    AC 14
    AC 0
    AC 2
    AC 4
    AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_10(pixel **dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov       r7, r0
%endif
    add       r2, 1024
    mov       r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add       r2, 1024-128*2
%if ARCH_X86_64
    mov       r0, [r7+gprsize]
%else
    mov       r0, r0m
    mov       r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct_add8_422_10(pixel **dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%assign last_block 44

%macro IDCT_ADD8_422 0

cglobal h264_idct_add8_422_10, 5, 8, 7
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov       r7, r0
%endif

    add       r2, 1024
    mov       r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    ADD16_OP_INTRA 24, 4+ 8*8 ; i+4
    ADD16_OP_INTRA 26, 4+ 9*8 ; i+4
    add       r2, 1024-128*4

%if ARCH_X86_64
    mov       r0, [r7+gprsize]
%else
    mov       r0, r0m
    mov       r0, [r0+gprsize]
%endif

    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    ADD16_OP_INTRA 40, 4+13*8 ; i+4
    ADD16_OP_INTRA 42, 4+14*8 ; i+4
    REP_RET
    AC 16
    AC 18
    AC 24 ; i+4
    AC 26 ; i+4
    AC 32
    AC 34
    AC 40 ; i+4
    AC 42 ; i+4

%endmacro

INIT_XMM sse2
IDCT_ADD8_422
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8_422
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add_10(pixel *dst, int16_t *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova      m7, [%1+112*2]
    mova      m6, [%1+ 96*2]
    mova      m5, [%1+ 80*2]
    mova      m3, [%1+ 48*2]
    mova      m2, [%1+ 32*2]
    mova      m1, [%1+ 16*2]
    IDCT8_1D  [%1], [%1+ 64*2]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D 0,1,2,3,8
    mova [%2    ], m0
    TRANSPOSE4x4D 4,5,6,7,8
    mova [%2+8*2], m4
%else
    mova     [%1], m7
    TRANSPOSE4x4D 0,1,2,3,7
    mova      m7, [%1]
    mova [%2     ], m0
    mova [%2+16*2], m1
    mova [%2+32*2], m2
    mova [%2+48*2], m3
    TRANSPOSE4x4D 4,5,6,7,3
    mova [%2+ 8*2], m4
    mova [%2+24*2], m5
    mova [%2+40*2], m6
    mova [%2+56*2], m7
%endif
%endmacro

; %1=pixel *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova [%2     ], m6
    mova [%2+16*2], m7

    pxor      m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova      m0, [%2     ]
    mova      m1, [%2+16*2]
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
    movsxdifnidn r2, r2d
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub       rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add       rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub       rsp, pad
    add dword [r1], 32

%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP      1,  9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP      0,  8
    SWAP      1,  9
    SWAP      2, 10
    SWAP      3, 11
    SWAP      4, 12
    SWAP      5, 13
    SWAP      6, 14
    SWAP      7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    psrad     m8, 6
    psrad     m0, 6
    packssdw  m8, m0
    paddsw    m8, [r0]
    pxor      m0, m0
    mova [r1+  0], m0
    mova [r1+ 16], m0
    mova [r1+ 32], m0
    mova [r1+ 48], m0
    mova [r1+ 64], m0
    mova [r1+ 80], m0
    mova [r1+ 96], m0
    mova [r1+112], m0
    mova [r1+128], m0
    mova [r1+144], m0
    mova [r1+160], m0
    mova [r1+176], m0
    mova [r1+192], m0
    mova [r1+208], m0
    mova [r1+224], m0
    mova [r1+240], m0
    CLIPW     m8, m0, [pw_pixel_max]
    mova    [r0], m8
    mova      m8, [pw_pixel_max]
    STORE_DIFF16 m9,  m1, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea       r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea       r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova [r1+  0], m7
    mova [r1+ 16], m7
    mova [r1+ 32], m7
    mova [r1+ 48], m7
    mova [r1+ 64], m7
    mova [r1+ 80], m7
    mova [r1+ 96], m7
    mova [r1+112], m7
    mova [r1+128], m7
    mova [r1+144], m7
    mova [r1+160], m7
    mova [r1+176], m7
    mova [r1+192], m7
    mova [r1+208], m7
    mova [r1+224], m7
    mova [r1+240], m7
%endif ; ARCH_X86_64
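
    ; free the on-stack scratch buffer used for the transposed coefficient halves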
    add       rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_h264_idct8_add4_10(pixel **dst, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro IDCT8_ADD4_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov       r0d, [r6+%1*4]
    add       r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add       r1, 256
%endif
%endmacro

%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    movsxdifnidn r3, r3d
    %assign pad 16-gprsize-(stack_offset&15)
    SUB       rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov       r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif