;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

; NOTE: pixels are 16-bit words here (10-bit depth), so one pixel = 2 bytes;
; byte offsets of -8/-16 step one 4-/8-pixel row of left neighbours.

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pw_1023
%define pw_pixel_max pw_1023                ; 10-bit max sample value (1<<10)-1
cextern pw_512
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
cextern pd_16

pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4  ; plane-prediction H/V weights
pw_m3:        times 8 dw -3
pd_17:        times 4 dd 17

SECTION .text

; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
; (pavgw adds the final rounding bit; %2 is clobbered as scratch)
%macro PRED4x4_LOWPASS 4
    paddw       %2, %3
    psrlw       %2, 1
    pavgw       %1, %4, %2
%endmacro

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_right_10(pixel *src, const pixel *topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
; r0 = src, r1 = topright (unused; reused as row pointer), r2 = byte stride.
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
    sub       r0, r2                        ; r0 -> top-neighbour row
    lea       r1, [r0+r2*2]
    ; gather left-column and top-row neighbours into one packed register
    movhps    m1, [r1-8]
    movhps    m2, [r0+r2*1-8]
    movhps    m4, [r0-8]
    punpckhwd m2, m4
    movq      m3, [r0]
    punpckhdq m1, m2
    PALIGNR   m3, m1, 10, m1
    movhps    m4, [r1+r2*1-8]
    PALIGNR   m0, m3, m4, 14, m4
    movhps    m4, [r1+r2*2-8]
    PALIGNR   m2, m0, m4, 14, m4
    PRED4x4_LOWPASS m0, m2, m3, m0
    ; each output row is the previous one shifted right by one pixel
    movq      [r1+r2*2], m0
    psrldq    m0, 2
    movq      [r1+r2*1], m0
    psrldq    m0, 2
    movq      [r0+r2*2], m0
    psrldq    m0, 2
    movq      [r0+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif

;------------------------------------------------------------------------------
; void ff_pred4x4_vertical_right_10(pixel *src, const pixel *topright,
;                                   ptrdiff_t stride)
;------------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
    sub     r0, r2
    lea     r1, [r0+r2*2]
    movq    m5, [r0]                        ; ........t3t2t1t0
    movhps  m1, [r0-8]
    PALIGNR m0, m5, m1, 14, m1              ; ......t3t2t1t0lt
    pavgw   m5, m0                          ; even rows: avg of top & shifted top
    movhps  m1, [r0+r2*1-8]
    PALIGNR m0, m1, 14, m1                  ; ....t3t2t1t0ltl0
    movhps  m2, [r0+r2*2-8]
    PALIGNR m1, m0, m2, 14, m2              ; ..t3t2t1t0ltl0l1
    movhps  m3, [r1+r2*1-8]
    PALIGNR m2, m1, m3, 14, m3              ; t3t2t1t0ltl0l1l2
    PRED4x4_LOWPASS m1, m0, m2, m1
    pslldq  m0, m1, 12
    psrldq  m1, 4
    movq    [r0+r2*1], m5
    movq    [r0+r2*2], m1
    PALIGNR m5, m0, 14, m2                  ; shift left pixels into lower rows
    pslldq  m0, 2
    movq    [r1+r2*1], m5
    PALIGNR m1, m0, 14, m0
    movq    [r1+r2*2], m1
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif

;-------------------------------------------------------------------------------
; void ff_pred4x4_horizontal_down_10(pixel *src, const pixel *topright,
;                                    ptrdiff_t stride)
;-------------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
    sub        r0, r2
    lea        r1, [r0+r2*2]
    movq       m0, [r0-8]                   ; lt ..
    movhps     m0, [r0]
    pslldq     m0, 2                        ; t2 t1 t0 lt .. .. .. ..
    movq       m1, [r1+r2*2-8]              ; l3
    movq       m3, [r1+r2*1-8]
    punpcklwd  m1, m3                       ; l2 l3
    movq       m2, [r0+r2*2-8]              ; l1
    movq       m3, [r0+r2*1-8]
    punpcklwd  m2, m3                       ; l0 l1
    punpckhdq  m1, m2                       ; l0 l1 l2 l3
    punpckhqdq m1, m0                       ; t2 t1 t0 lt l0 l1 l2 l3
    psrldq     m0, m1, 4                    ; .. .. t2 t1 t0 lt l0 l1
    psrldq     m3, m1, 2                    ; .. t2 t1 t0 lt l0 l1 l2
    pavgw      m5, m1, m3                   ; 2-tap averages
    PRED4x4_LOWPASS m3, m1, m0, m3          ; 3-tap lowpass values
    punpcklwd  m5, m3                       ; interleave avg/lowpass diagonals
    psrldq     m3, 8
    PALIGNR    m3, m5, 12, m4
    movq       [r1+r2*2], m5
    movhps     [r0+r2*2], m5
    psrldq     m5, 4
    movq       [r1+r2*1], m5
    movq       [r0+r2*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_dc_10(pixel *src, const pixel *topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (l0+l1+l2+l3 + t0+t1+t2+t3 + 4) >> 3, broadcast to all 16 pixels.
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
    sub    r0, r2
    lea    r1, [r0+r2*2]
    movq   m2, [r0+r2*1-8]
    paddw  m2, [r0+r2*2-8]
    paddw  m2, [r1+r2*1-8]
    paddw  m2, [r1+r2*2-8]
    psrlq  m2, 48                           ; keep only the left-pixel lane sum
    movq   m0, [r0]
    HADDW  m0, m1                           ; sum of the 4 top pixels
    paddw  m0, [pw_4]                       ; rounding
    paddw  m0, m2
    psrlw  m0, 3
    SPLATW m0, m0, 0
    movq   [r0+r2*1], m0
    movq   [r0+r2*2], m0
    movq   [r1+r2*1], m0
    movq   [r1+r2*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred4x4_down_left_10(pixel *src, const pixel *topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
    sub      r0, r2
    movq     m0, [r0]                       ; top row
    movhps   m0, [r1]                       ; topright row in high half
    psrldq   m2, m0, 2
    pslldq   m3, m0, 2
    pshufhw  m2, m2, 10100100b              ; duplicate last pixel for the edge
    PRED4x4_LOWPASS m0, m3, m2, m0
    lea      r1, [r0+r2*2]
    movhps   [r1+r2*2], m0
    psrldq   m0, 2                          ; successive rows shift down-left
    movq     [r0+r2*1], m0
    psrldq   m0, 2
    movq     [r0+r2*2], m0
    psrldq   m0, 2
    movq     [r1+r2*1], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_vertical_left_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
    sub      r0, r2
    movu     m1, [r0]                       ; top row (unaligned)
    movhps   m1, [r1]                       ; topright
    psrldq   m0, m1, 2
    psrldq   m2, m1, 4
    pavgw    m4, m0, m1                     ; even rows: 2-tap average
    PRED4x4_LOWPASS m0, m1, m2, m0          ; odd rows: 3-tap lowpass
    lea      r1, [r0+r2*2]
    movq     [r0+r2*1], m4
    movq     [r0+r2*2], m0
    psrldq   m4, 2
    psrldq   m0, 2
    movq     [r1+r2*1], m4
    movq     [r1+r2*2], m0
    RET
%endmacro

INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif

;-----------------------------------------------------------------------------
; void ff_pred4x4_horizontal_up_10(pixel *src, const pixel *topright,
;                                  ptrdiff_t stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
    sub       r0, r2
    lea       r1, [r0+r2*2]
    ; pack the four left-neighbour pixels l0..l3 into m0
    movq      m0, [r0+r2*1-8]
    punpckhwd m0, [r0+r2*2-8]
    movq      m1, [r1+r2*1-8]
    punpckhwd m1, [r1+r2*2-8]
    punpckhdq m0, m1
    pshufw    m1, m1, 0xFF                  ; broadcast l3 (bottom-edge fill)
    movq      [r1+r2*2], m1
    movd      [r1+r2*1+4], m1
    pshufw    m2, m0, 11111001b
    movq      m1, m2
    pavgw     m2, m0                        ; 2-tap averages

    pshufw    m5, m0, 11111110b
    PRED4x4_LOWPASS m1, m0, m5, m1          ; 3-tap lowpass values
    movq      m6, m2
    punpcklwd m6, m1                        ; interleave avg/lowpass pairs
    movq      [r0+r2*1], m6
    psrlq     m2, 16
    psrlq     m1, 16
    punpcklwd m2, m1
    movq      [r0+r2*2], m2
    psrlq     m2, 32
    movd      [r1+r2*1], m2
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Copies the 8-pixel top-neighbour row into all 8 rows.
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
    sub  r0, r1
    mova m0, [r0]
%rep 3
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    lea  r0, [r0+r1*2]
%endrep
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Broadcasts each row's left-neighbour pixel across that row, two rows/iter.
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
    mov        r2d, 4
.loop:
    movq       m0, [r0+r1*0-8]
    movq       m1, [r0+r1*1-8]
    pshuflw    m0, m0, 0xff                 ; splat left pixel in low half
    pshuflw    m1, m1, 0xff
    punpcklqdq m0, m0                       ; duplicate into high half
    punpcklqdq m1, m1
    mova       [r0+r1*0], m0
    mova       [r0+r1*1], m1
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET

;-----------------------------------------------------------------------------
; void ff_predict_8x8_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Store one 8-pixel (16-byte) row: two mmx regs or one xmm reg.
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
    movq   [%1+0], %2
    movq   [%1+8], %3
%else
    movdqa [%1], %2
%endif
%endmacro

; %1 = word-shuffle instruction to use (pshufw for mmx, pshuflw for sse2)
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
    sub         r0, r1
    pxor        m4, m4
    movq        m0, [r0+0]                  ; top pixels 0..3
    movq        m1, [r0+8]                  ; top pixels 4..7
%if mmsize==16
    punpcklwd   m0, m1
    movhlps     m1, m0
    paddw       m0, m1
%else
    pshufw      m2, m0, 00001110b
    pshufw      m3, m1, 00001110b
    paddw       m0, m2
    paddw       m1, m3
    punpcklwd   m0, m1
%endif
    %1          m2, m0, 00001110b
    paddw       m0, m2                      ; m0 = s0 (left-top sum), s1 (right-top sum)

    lea         r5, [r1*3]
    lea         r4, [r0+r1*4]
    ; sum upper-left quadrant's left neighbours (scalar)
    movzx       r2d, word [r0+r1*1-2]
    movzx       r3d, word [r0+r1*2-2]
    add         r2d, r3d
    movzx       r3d, word [r0+r5*1-2]
    add         r2d, r3d
    movzx       r3d, word [r4-2]
    add         r2d, r3d
    movd        m2, r2d                     ; s2

    ; sum lower-left quadrant's left neighbours
    movzx       r2d, word [r4+r1*1-2]
    movzx       r3d, word [r4+r1*2-2]
    add         r2d, r3d
    movzx       r3d, word [r4+r5*1-2]
    add         r2d, r3d
    movzx       r3d, word [r4+r1*4-2]
    add         r2d, r3d
    movd        m3, r2d                     ; s3

    punpcklwd   m2, m3
    punpckldq   m0, m2                      ; s0, s1, s2, s3
    %1          m3, m0, 11110110b           ; s2, s1, s3, s3
    %1          m0, m0, 01110100b           ; s0, s1, s3, s1
    paddw       m0, m3
    psrlw       m0, 2
    pavgw       m0, m4                      ; s0+s2, s1, s3, s1+s3 (per-quadrant DCs)
%if mmsize==16
    punpcklwd   m0, m0
    pshufd      m3, m0, 11111010b
    punpckldq   m0, m0
    SWAP         0,1
%else
    pshufw      m1, m0, 0x00
    pshufw      m2, m0, 0x55
    pshufw      m3, m0, 0xaa
    pshufw      m4, m0, 0xff
%endif
    MOV8        r0+r1*1, m1, m2
    MOV8        r0+r1*2, m1, m2
    MOV8        r0+r5*1, m1, m2
    MOV8        r0+r1*4, m1, m2
    MOV8        r4+r1*1, m3, m4
    MOV8        r4+r1*2, m3, m4
    MOV8        r4+r5*1, m3, m4
    MOV8        r4+r1*4, m3, m4
    RET
%endmacro

INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw

;-----------------------------------------------------------------------------
; void ff_pred8x8_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Per-half DC from top neighbours only: two 4-pixel sums, (sum+2)>>2 each.
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
    sub  r0, r1
    mova m0, [r0]
    pshuflw m1, m0, 0x4e
    pshufhw m1, m1, 0x4e
    paddw   m0, m1
    pshuflw m1, m0, 0xb1
    pshufhw m1, m1, 0xb1
    paddw   m0, m1                          ; each half now holds its 4-pixel sum
    lea     r2, [r1*3]
    lea     r3, [r0+r1*4]
    paddw   m0, [pw_2]                      ; rounding
    psrlw   m0, 2
    mova [r0+r1*1], m0
    mova [r0+r1*2], m0
    mova [r0+r2*1], m0
    mova [r0+r1*4], m0
    mova [r3+r1*1], m0
    mova [r3+r1*2], m0
    mova [r3+r2*1], m0
    mova [r3+r1*4], m0
    RET

;-----------------------------------------------------------------------------
; void ff_pred8x8_plane_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Plane prediction: computes gradients H (horizontal, from the top row via
; pmaddwd with pw_m32101234) and V (vertical, from left pixels via scalar
; weighted differences), scales by 17 (pd_17) with rounding (pd_16), then
; fills rows with a clipped linear ramp. r0 = src, r1 = byte stride.
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
    sub       r0, r1
    lea       r2, [r1*3]
    lea       r3, [r0+r1*4]
    mova      m2, [r0]
    pmaddwd   m2, [pw_m32101234]
    HADDD     m2, m1
    movd      m0, [r0-4]
    psrld     m0, 14
    psubw     m2, m0                        ; H
    movd      m0, [r3+r1*4-4]
    movd      m1, [r0+12]
    paddw     m0, m1
    psllw     m0, 4                         ; 16*(src[7*stride-1] + src[-stride+7])
    movzx     r4d, word [r3+r1*1-2]         ; src[4*stride-1]
    movzx     r5d, word [r0+r2*1-2]         ; src[2*stride-1]
    sub       r4d, r5d
    movzx     r6d, word [r3+r1*2-2]         ; src[5*stride-1]
    movzx     r5d, word [r0+r1*2-2]         ; src[1*stride-1]
    sub       r6d, r5d
    lea       r4d, [r4+r6*2]
    movzx     r5d, word [r3+r2*1-2]         ; src[6*stride-1]
    movzx     r6d, word [r0+r1*1-2]         ; src[0*stride-1]
    sub       r5d, r6d
    lea       r5d, [r5*3]
    add       r4d, r5d
    movzx     r6d, word [r3+r1*4-2]         ; src[7*stride-1]
    movzx     r5d, word [r0+r1*0-2]         ; src[ -stride-1]
    sub       r6d, r5d
    lea       r4d, [r4+r6*4]
    movd      m3, r4d                       ; V
    punpckldq m2, m3
    pmaddwd   m2, [pd_17]
    paddd     m2, [pd_16]
    psrad     m2, 5                         ; b, c

    mova      m3, [pw_pixel_max]
    pxor      m1, m1
    SPLATW    m0, m0, 1
    SPLATW    m4, m2, 2
    SPLATW    m2, m2, 0
    pmullw    m2, [pw_m32101234]            ; b
    pmullw    m5, m4, [pw_m3]               ; c
    paddw     m5, [pw_16]
    mov       r2d, 8
    add       r0, r1
.loop:
    paddsw    m6, m2, m5
    paddsw    m6, m0
    psraw     m6, 5
    CLIPW     m6, m1, m3                    ; clamp to [0, pw_pixel_max]
    mova      [r0], m6
    paddw     m5, m4                        ; advance vertical term by c
    add       r0, r1
    dec       r2d
    jg .loop
    REP_RET


;-----------------------------------------------------------------------------
; void ff_pred8x8l_128_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the 8x8 block with the mid-grey constant; neighbour flags are unused.
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
    mova      m0, [pw_512]                  ; (1<<(BIT_DEPTH-1))
    lea       r1, [r3*3]
    lea       r2, [r0+r3*4]
    MOV8 r0+r3*0, m0, m0
    MOV8 r0+r3*1, m0, m0
    MOV8 r0+r3*2, m0, m0
    MOV8 r0+r1*1, m0, m0
    MOV8 r2+r3*0, m0, m0
    MOV8 r2+r3*1, m0, m0
    MOV8 r2+r3*2, m0, m0
    MOV8 r2+r1*1, m0, m0
    RET
%endmacro

INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC

;-----------------------------------------------------------------------------
; void ff_pred8x8l_top_dc_10(pixel *src, int has_topleft, int has_topright,
;                            ptrdiff_t stride)
;-----------------------------------------------------------------------------
; shr r1d,14 / shr r2d,13 turn the has_topleft/has_topright flags into small
; offsets selecting either a real neighbour or a duplicated edge pixel.
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr         r1d, 14
    shr         r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0              ; topleft or duplicated t0
    pinsrw      m2, [r0+r2+14], 7           ; topright or duplicated t7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    HADDW       m0, m1
    paddw       m0, [pw_4]
    psrlw       m0, 3                       ; DC = (sum of 8 filtered tops + 4) >> 3
    SPLATW      m0, m0, 0
    mova        [r0+r3*1], m0
    mova        [r0+r3*2], m0
    mova        [r0+r1*1], m0
    mova        [r0+r3*4], m0
    mova        [r2+r3*1], m0
    mova        [r2+r3*2], m0
    mova        [r2+r1*1], m0
    mova        [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif

;-------------------------------------------------------------------------------
; void ff_pred8x8l_dc_10(pixel *src, int has_topleft, int has_topright,
;                        ptrdiff_t stride)
;-------------------------------------------------------------------------------
;TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r5, [r3*3]
    ; transpose the 8 left-neighbour pixels into m3
    mova        m0, [r0+r3*2-16]
    punpckhwd   m0, [r0+r3*1-16]
    mova        m1, [r4+r3*0-16]
    punpckhwd   m1, [r0+r5*1-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*2-16]
    punpckhwd   m2, [r4+r3*1-16]
    mova        m3, [r4+r3*4-16]
    punpckhwd   m3, [r4+r5*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r0]
    shr         r1d, 14                     ; has_topleft -> offset
    shr         r2d, 13                     ; has_topright -> offset
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    not         r1
    and         r1, r3
    pslldq      m4, m3, 2
    psrldq      m5, m3, 2
    pshuflw     m4, m4, 11100101b
    pinsrw      m5, [r0+r1-2], 7
    PRED4x4_LOWPASS m3, m4, m5, m3          ; filtered left column
    PRED4x4_LOWPASS m0, m2, m1, m0          ; filtered top row
    paddw       m0, m3
    HADDW       m0, m1
    paddw       m0, [pw_8]
    psrlw       m0, 4                       ; DC = (16 filtered neighbours + 8) >> 4
    SPLATW      m0, m0
    mova        [r0+r3*1], m0
    mova        [r0+r3*2], m0
    mova        [r0+r5*1], m0
    mova        [r0+r3*4], m0
    mova        [r4+r3*1], m0
    mova        [r4+r3*2], m0
    mova        [r4+r5*1], m0
    mova        [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_10(pixel *src, int has_topleft, int has_topright,
;                              ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Lowpass-filters the top row (with edge handling via the flag offsets) and
; copies it into all 8 rows.
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
    sub         r0, r3
    mova        m0, [r0]
    shr         r1d, 14
    shr         r2d, 13
    neg         r1
    pslldq      m1, m0, 2
    psrldq      m2, m0, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    PRED4x4_LOWPASS m0, m2, m1, m0
    mova        [r0+r3*1], m0
    mova        [r0+r3*2], m0
    mova        [r0+r1*1], m0
    mova        [r0+r3*4], m0
    mova        [r2+r3*1], m0
    mova        [r2+r3*2], m0
    mova        [r2+r1*1], m0
    mova        [r2+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_10(uint8_t *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
    mova        m0, [r0-16]
    shr         r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3                      ; r1 = -stride if has_topleft else 0
    punpckhwd   m0, [r0+r1-16]
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r3*1-16]
    lea         r2, [r0+r3*4]
    lea         r1, [r3*3]
    punpckhdq   m1, m0
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r0+r1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r3*1-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1                      ; transposed left column
    PALIGNR     m4, m3, [r2+r1-16], 14, m0
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m4, m3, m0, m4          ; filtered left pixels
    ; broadcast each filtered pixel across its row
    punpckhwd   m3, m4, m4
    punpcklwd   m4, m4
    pshufd      m0, m3, 0xff
    pshufd      m1, m3, 0xaa
    pshufd      m2, m3, 0x55
    pshufd      m3, m3, 0x00
    mova        [r0+r3*0], m0
    mova        [r0+r3*1], m1
    mova        [r0+r3*2], m2
    mova        [r0+r1*1], m3
    pshufd      m0, m4, 0xff
    pshufd      m1, m4, 0xaa
    pshufd      m2, m4, 0x55
    pshufd      m3, m4, 0x00
    mova        [r2+r3*0], m0
    mova        [r2+r3*1], m1
    mova        [r2+r3*2], m2
    mova        [r2+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_left_10(pixel *src, int has_topleft, int has_topright,
;                               ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
    sub         r0, r3
    mova        m3, [r0]
    shr         r1d, 14
    neg         r1
    shr         r2d, 13
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0+r1], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m6, m2, m1, m3          ; filtered top row
    jz .fix_tr ; flags from shr r2d
    mova        m1, [r0+16]                 ; real topright row
    psrldq      m5, m1, 2
    PALIGNR     m2, m1, m3, 14, m3
    pshufhw     m5, m5, 10100100b
    PRED4x4_LOWPASS m1, m2, m5, m1          ; filtered topright
.do_topright:
    lea         r1, [r3*3]
    psrldq      m5, m1, 14
    lea         r2, [r0+r3*4]
    PALIGNR     m2, m1, m6, 2, m0
    PALIGNR     m3, m1, m6, 14, m0
    PALIGNR     m5, m1, 2, m0
    pslldq      m4, m6, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m1, m3, m5, m1
    ; bottom row first, then shift the diagonal up one pixel per row
    mova        [r2+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r2+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r2+r3*2], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r2+r3*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*4], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r1*1], m1
    PALIGNR     m1, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*2], m1
    PALIGNR     m1, m6, 14, m6
    mova        [r0+r3*1], m1
    RET
.fix_tr:                                    ; no topright: replicate last top pixel
    punpckhwd   m3, m3
    pshufd      m1, m3, 0xFF
    jmp .do_topright
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_down_right_10(pixel *src, int has_topleft,
;                                int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; standard forbids this when has_topleft is false
; no need to check
cglobal pred8x8l_down_right_10, 4, 5, 8
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; transpose left-neighbour column into m3
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    pslldq      m0, m4, 2
    pshuflw     m0, m0, 11100101b
    PRED4x4_LOWPASS m6, m1, m4, m3
    PRED4x4_LOWPASS m4, m3, m0, m4
    mova        m3, [r0]
    shr         r2d, 13                     ; has_topright -> offset
    pslldq      m1, m3, 2
    psrldq      m2, m3, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m2, [r0+r2+14], 7
    PRED4x4_LOWPASS m3, m2, m1, m3          ; filtered top row
    PALIGNR     m2, m3, m6, 2, m0
    PALIGNR     m5, m3, m6, 14, m0
    psrldq      m7, m3, 2
    PRED4x4_LOWPASS m6, m4, m2, m6
    PRED4x4_LOWPASS m3, m5, m7, m3
    ; emit diagonal, shifting one pixel per row
    mova        [r4+r3*4], m6
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*2], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r1*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r0+r3*4], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r4+r3*1], m3
    PALIGNR     m3, m6, 14, m2
    pslldq      m6, 2
    mova        [r4+r3*2], m3
    PALIGNR     m3, m6, 14, m6
    mova        [r4+r1*1], m3
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_vertical_right_10(pixel *src, int has_topleft,
;                                    int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; likewise with 8x8l_down_right
cglobal pred8x8l_vertical_right_10, 4, 5, 7
    sub         r0, r3
    lea         r4, [r0+r3*4]
    lea         r1, [r3*3]
    ; transpose left-neighbour column into m3
    mova        m0, [r0+r3*1-16]
    punpckhwd   m0, [r0+r3*0-16]
    mova        m1, [r0+r1*1-16]
    punpckhwd   m1, [r0+r3*2-16]
    punpckhdq   m1, m0
    mova        m2, [r4+r3*1-16]
    punpckhwd   m2, [r4+r3*0-16]
    mova        m3, [r4+r1*1-16]
    punpckhwd   m3, [r4+r3*2-16]
    punpckhdq   m3, m2
    punpckhqdq  m3, m1
    mova        m0, [r4+r3*4-16]
    mova        m1, [r0]
    PALIGNR     m4, m3, m0, 14, m0
    PALIGNR     m1, m3, 2, m2
    PRED4x4_LOWPASS m3, m1, m4, m3          ; filtered left
    mova        m2, [r0]
    shr         r2d, 13
    pslldq      m1, m2, 2
    psrldq      m5, m2, 2
    pinsrw      m1, [r0-2], 0
    pinsrw      m5, [r0+r2+14], 7
    PRED4x4_LOWPASS m2, m5, m1, m2          ; filtered top
    PALIGNR     m6, m2, m3, 12, m1
    PALIGNR     m5, m2, m3, 14, m0
    PRED4x4_LOWPASS m0, m6, m2, m5          ; odd rows (3-tap)
    pavgw       m2, m5                      ; even rows (2-tap)
    mova        [r0+r3*2], m0
    mova        [r0+r3*1], m2
    pslldq      m6, m3, 4
    pslldq      m1, m3, 2
    PRED4x4_LOWPASS m1, m3, m6, m1
    ; remaining rows: shift in filtered left pixels one at a time
    PALIGNR     m2, m1, 14, m4
    mova        [r0+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova        [r0+r3*4], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova        [r4+r3*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m3
    mova        [r4+r3*2], m0
    pslldq      m1, 2
    PALIGNR     m2, m1, 14, m4
    mova        [r4+r1*1], m2
    pslldq      m1, 2
    PALIGNR     m0, m1, 14, m1
    mova        [r4+r3*4], m0
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif

;-----------------------------------------------------------------------------
; void ff_pred8x8l_horizontal_up_10(pixel *src, int has_topleft,
;                                   int has_topright, ptrdiff_t stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
    mova        m0, [r0+r3*0-16]
    punpckhwd   m0, [r0+r3*1-16]
    shr         r1d, 14
    dec         r1
    and         r1, r3
    sub         r1, r3                      ; r1 = -stride if has_topleft else 0
    mova        m4, [r0+r1*1-16]
    lea         r1, [r3*3]
    lea         r2, [r0+r3*4]
    ; transpose left-neighbour column into m0
    mova        m1, [r0+r3*2-16]
    punpckhwd   m1, [r0+r1*1-16]
    punpckhdq   m0, m1
    mova        m2, [r2+r3*0-16]
    punpckhwd   m2, [r2+r3*1-16]
    mova        m3, [r2+r3*2-16]
    punpckhwd   m3, [r2+r1*1-16]
    punpckhdq   m2, m3
    punpckhqdq  m0, m2
    PALIGNR     m1, m0, m4, 14, m4
    psrldq      m2, m0, 2
    pshufhw     m2, m2, 10100100b           ; duplicate bottom pixel at the edge
    PRED4x4_LOWPASS m0, m1, m2, m0
    psrldq      m1, m0, 2
    psrldq      m2, m0, 4
    pshufhw     m1, m1, 10100100b
    pshufhw     m2, m2, 01010100b
    pavgw       m4, m0, m1                  ; 2-tap averages
    PRED4x4_LOWPASS m1, m2, m0, m1          ; 3-tap lowpass values
    punpckhwd   m5, m4, m1                  ; interleave into output diagonals
    punpcklwd   m4, m1
    mova        [r2+r3*0], m5
    mova        [r0+r3*0], m4
    pshufd      m0, m5, 11111001b
    pshufd      m1, m5, 11111110b
    pshufd      m2, m5, 11111111b
    mova        [r2+r3*1], m0
    mova        [r2+r3*2], m1
    mova        [r2+r1*1], m2
    PALIGNR     m2, m5, m4, 4, m0
    PALIGNR     m3, m5, m4, 8, m1
    PALIGNR     m5, m5, m4, 12, m4
    mova        [r0+r3*1], m2
    mova        [r0+r3*2], m3
    mova        [r0+r1*1], m5
    RET
%endmacro

INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif


;-----------------------------------------------------------------------------
; void ff_pred16x16_vertical_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Store one 16-pixel (32-byte) row: four mmx regs or two xmm regs.
%macro MOV16 3-5
    mova [%1+     0], %2
    mova [%1+mmsize], %3
%if mmsize==8
    mova [%1+    16], %4
    mova [%1+    24], %5
%endif
%endmacro

%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
    sub   r0, r1
    mov   r2d, 8
    mova  m0, [r0+ 0]
    mova  m1, [r0+mmsize]
%if mmsize==8
    mova  m2, [r0+16]
    mova  m3, [r0+24]
%endif
.loop:
    MOV16 r0+r1*1, m0, m1, m2, m3
    MOV16 r0+r1*2, m0, m1, m2, m3
    lea   r0, [r0+r1*2]
    dec   r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_horizontal_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Broadcasts each row's left-neighbour pixel across the row, two rows/iter.
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
    mov    r2d, 8
.vloop:
    movd   m0, [r0+r1*0-4]
    movd   m1, [r0+r1*1-4]
    SPLATW m0, m0, 1
    SPLATW m1, m1, 1
    MOV16  r0+r1*0, m0, m0, m0, m0
    MOV16  r0+r1*1, m1, m1, m1, m1
    lea    r0, [r0+r1*2]
    dec    r2d
    jg .vloop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL

;-----------------------------------------------------------------------------
; void ff_pred16x16_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (sum of 16 top + sum of 16 left + 16) >> 5, broadcast to all pixels.
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
    mov        r5, r0
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2                       ; horizontal sum of top row

    ; accumulate the 16 left-neighbour pixels in two scalar accumulators
    lea        r0, [r0+r1-2]
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+16]              ; left sum + rounding

    movd       m1, r3d
    paddw      m0, m1
    psrlw      m0, 5
    SPLATW     m0, m0
    mov        r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_top_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (sum of 16 top pixels + 8) >> 4 (splat first, then the per-lane
; add/shift act identically on every lane).
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
    sub        r0, r1
    mova       m0, [r0+0]
    paddw      m0, [r0+mmsize]
%if mmsize==8
    paddw      m0, [r0+16]
    paddw      m0, [r0+24]
%endif
    HADDW      m0, m2

    SPLATW     m0, m0
    paddw      m0, [pw_8]
    psrlw      m0, 4
    mov        r2d, 8
.loop:
    MOV16 r0+r1*1, m0, m0, m0, m0
    MOV16 r0+r1*2, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_left_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; DC = (sum of 16 left pixels + 8) >> 4, computed entirely with scalar adds.
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
    mov        r5, r0

    sub        r0, 2
    movzx      r3d, word [r0]
    movzx      r4d, word [r0+r1]
%rep 7
    lea        r0, [r0+r1*2]
    movzx      r2d, word [r0]
    add        r3d, r2d
    movzx      r2d, word [r0+r1]
    add        r4d, r2d
%endrep
    lea        r3d, [r3+r4+8]
    shr        r3d, 4

    movd       m0, r3d
    SPLATW     m0, m0
    mov        r3d, 8
.loop:
    MOV16 r5+r1*0, m0, m0, m0, m0
    MOV16 r5+r1*1, m0, m0, m0, m0
    lea        r5, [r5+r1*2]
    dec        r3d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC

;-----------------------------------------------------------------------------
; void ff_pred16x16_128_dc_10(pixel *src, ptrdiff_t stride)
;-----------------------------------------------------------------------------
; Fills the 16x16 block with the 10-bit mid-grey constant 512.
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
    mova       m0, [pw_512]
    mov        r2d, 8
.loop:
    MOV16 r0+r1*0, m0, m0, m0, m0
    MOV16 r0+r1*1, m0, m0, m0, m0
    lea        r0, [r0+r1*2]
    dec        r2d
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC