;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*           (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

minshort:               times 8 dw 0x8000
yuv2yuvX_16_start:      times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:      times 4 dd 0x10000
yuv2yuvX_9_start:       times 4 dd 0x20000
yuv2yuvX_10_upper:      times 8 dw 0x3ff
yuv2yuvX_9_upper:       times 8 dw 0x1ff
pd_4:                   times 4 dd 4
pd_4min0x40000:         times 4 dd 4 - (0x40000)
pw_16:                  times 8 dw 16
pw_32:                  times 8 dw 32
pd_255:                 times 8 dd 255
pw_512:                 times 8 dw 512
pw_1024:                times 8 dw 1024
pd_65535_invf:          times 8 dd 0x37800080 ;1.0/65535.0
pd_yuv2gbrp16_start:    times 8 dd -0x40000000
pd_yuv2gbrp_y_start:    times 8 dd (1 << 9)
pd_yuv2gbrp_uv_start:   times 8 dd ((1 << 9) - (128 << 19))
pd_yuv2gbrp_a_start:    times 8 dd (1 << 18)
pd_yuv2gbrp16_offset:   times 8 dd 0x10000 ;(1 << 16)
pd_yuv2gbrp16_round13:  times 8 dd 0xE0002000 ;(1 << 13) - (1 << 29)
pd_yuv2gbrp16_a_offset: times 8 dd 0x20002000
pd_yuv2gbrp16_upper30:  times 8 dd 0x3FFFFFFF ;(1 << 30) - 1
pd_yuv2gbrp16_upper27:  times 8 dd 0x07FFFFFF ;(1 << 27) - 1
pd_yuv2gbrp16_upper16:  times 8 dd 0x0000FFFF ;(1 << 16) - 1
pd_yuv2gbrp16_upperC:   times 8 dd 0xC0000000
pd_yuv2gbrp_debias:     times 8 dd 0x00008000 ;(1 << (29 - 14))
pb_pack_shuffle8:       db  0,  4,  8, 12, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                            0,  4,  8, 12, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1
pb_pack_shuffle16le:    db  0,  1,  4,  5, \
                            8,  9, 12, 13, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                            0,  1,  4,  5, \
                            8,  9, 12, 13
pb_pack_shuffle16be:    db  1,  0,  5,  4, \
                            9,  8, 13, 12, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                           -1, -1, -1, -1, \
                            1,  0,  5,  4, \
                            9,  8, 13, 12
pb_shuffle32be:         db  3,  2,  1,  0, \
                            7,  6,  5,  4, \
                           11, 10,  9,  8, \
                           15, 14, 13, 12, \
                            3,  2,  1,  0, \
                            7,  6,  5,  4, \
                           11, 10,  9,  8, \
                           15, 14, 13, 12
yuv2nv12_shuffle_mask:  times 2 db  0,  4,  8, 12, \
                                   -1, -1, -1, -1, \
                                   -1, -1, -1, -1, \
                                   -1, -1, -1, -1
yuv2nv21_shuffle_mask:  times 2 db  4,  0, 12,  8, \
                                   -1, -1, -1, -1, \
                                   -1, -1, -1, -1, \
                                   -1, -1, -1, -1
yuv2nv12_permute_mask:  dd 0, 4, 1, 2, 3, 5, 6, 7

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
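
; A rough scalar model of the 8-bit yuv2planeX case (illustrative only, not
; the exact C implementation in libswscale):
;
;   for (i = 0; i < dstW; i++) {
;       int val = dither[(i + offset) & 7] << 12;
;       for (j = 0; j < filterSize; j++)
;           val += src[j][i] * filter[j];   // 15-bit samples, 12-bit coeffs
;       dst[i] = av_clip_uint8(val >> 19);
;   }
;
; The 9/10/16-bit variants start from a rounding constant instead of the
; dither bytes and saturate to the respective output range.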
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
    ; the rep here is for the 8-bit output MMX case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2, m8
    mova            m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2, m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg, fltsizem
.filterloop_%2_ %+ %%i:
    ; input pixels
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7, m0, 0                  ; coeff[0]
    pshuflw         m0, m0, 0x55               ; coeff[1]
    pmovsxwd        m7, m7                     ; word -> dword
    pmovsxwd        m0, m0                     ; word -> dword

    pmulld          m3, m7
    pmulld          m5, m7
    pmulld          m4, m0
    pmulld          m6, m0

    paddd           m2, m3
    paddd           m1, m5
    paddd           m2, m4
    paddd           m1, m6
%else ; %1 == 10/9/8
    punpcklwd       m5, m3, m4
    punpckhwd       m3, m4
    SPLATD          m0

    pmaddwd         m5, m0
    pmaddwd         m3, m0

    paddd           m2, m5
    paddd           m1, m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg, 2
    jg .filterloop_%2_ %+ %%i

%if %1 == 16
    psrad           m2, 31 - %1
    psrad           m1, 31 - %1
%else ; %1 == 10/9/8
    psrad           m2, 27 - %1
    psrad           m1, 27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw        m2, m1
    packuswb        m2, m2
    movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2, m1
    paddw           m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2, m1
%else ; mmxext/sse2
    packssdw        m2, m1
    pmaxsw          m2, m6
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16

    add             r5, mmsize/2
    sub             wd, mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop_%2
%endmacro

%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6, m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
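    ; x86-32 has too few XMM registers to keep the expanded dither rows
    ; resident, so reserve stack scratch (32 bytes plus alignment slack) for
    ; the dword-expanded, <<12-scaled dither values reloaded in the main loop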
    SUB            rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]        ; dither
    test       offsetd, offsetd
    jz .no_rot
%if mmsize == 16
    punpcklqdq  m_dith, m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
    punpcklbw   m_dith, m6
%if ARCH_X86_64
    punpcklwd       m8, m_dith, m6
    pslld           m8, 12
%else ; x86-32
    punpcklwd       m5, m_dith, m6
    pslld           m5, 12
%endif ; x86-32/64
    punpckhwd   m_dith, m6
    pslld       m_dith, 12
%if ARCH_X86_32
    mova      [rsp+ 0], m5
    mova      [rsp+16], m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5, m_dith, m6
    punpckhbw   m_dith, m6
    punpcklwd       m4, m5, m6
    punpckhwd       m5, m6
    punpcklwd       m3, m_dith, m6
    punpckhwd   m_dith, m6
    pslld           m4, 12
    pslld           m5, 12
    pslld           m3, 12
    pslld       m_dith, 12
    mova      [rsp+ 0], m4
    mova      [rsp+ 8], m5
    mova      [rsp+16], m3
    mova      [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5, r5

%if mmsize == 8 || %1 == 8
    yuv2planeX_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2planeX_mainloop %1, a
    REP_RET
.unaligned:
    yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16

%if %1 == 8
%if ARCH_X86_32
    ADD            rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1
    mov%2    [dstq+wq], m0
%elif %1 == 16
    paddd           m0, m4, [srcq+wq*4+mmsize*0]
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4
    pmaxsw          m1, m4
    pminsw          m0, m3
    pminsw          m1, m3
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl .loop_%2
%endmacro

%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add           dstq, wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4, m4               ; zero

    ; create registers holding dither
    movq            m3, [ditherq]        ; dither
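    ; offset is either 0 or 3 (see above); a non-zero offset rotates the
    ; eight dither bytes so alternating lines use a shifted dither pattern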
    test       offsetd, offsetd
    jz .no_rot
    punpcklqdq      m3, m3
    PALIGNR         m3, m3, 3, m2
.no_rot:
    punpcklbw       m3, m4
    mova            m2, m3
%elif %1 == 9
    pxor            m4, m4
    mova            m3, [pw_512]
    mova            m2, [pw_32]
%elif %1 == 10
    pxor            m4, m4
    mova            m3, [pw_1024]
    mova            m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
    REP_RET
%endmacro

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

%undef movsx

;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------

%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth

    mov          tmp1q, qword [ditherq]
    movq           xm0, tmp1q
    ror          tmp1q, 24
    movq           xm1, tmp1q

    pmovzxbd        m0, xm0
    pslld           m0, m0, 12                   ; ditherLo
    pmovzxbd        m1, xm1
    pslld           m1, m1, 12                   ; ditherHi

    pxor            m9, m9                       ; uint8_min dwords
    mova           m10, [pd_255]                 ; uint8_max dwords
    mova           m11, [%1_shuffle_mask]        ; shuffle_mask
    mova           m12, [yuv2nv12_permute_mask]  ; permute mask

    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth

    xor            r8q, r8q

nv12_outer_%1:
    mova            m2, m0                       ; resultLo
    mova            m3, m1                       ; resultHi
    xor            r9q, r9q

nv12_inner_%1:
    movsx         r10d, word [filterq + (2 * r9q)]
    movd           xm4, r10d
    vpbroadcastd    m4, xm4                      ; filter

    mov          tmp1q, [uq + (gprsize * r9q)]
    mova           xm7, oword [tmp1q + 2 * r8q]

    mov          tmp2q, [vq + (gprsize * r9q)]
    mova           xm8, oword [tmp2q + 2 * r8q]

    punpcklwd      xm5, xm7, xm8
    pmovsxwd        m5, xm5                      ; multiplicandsLo
    punpckhwd      xm6, xm7, xm8
    pmovsxwd        m6, xm6                      ; multiplicandsHi

    pmulld          m7, m5, m4                   ; mulResultLo
    pmulld          m8, m6, m4                   ; mulResultHi
    paddd           m2, m2, m7                   ; resultLo += mulResultLo
    paddd           m3, m3, m8                   ; resultHi += mulResultHi

    inc            r9d
    cmp            r9d, filterSized
    jl nv12_inner_%1
    ; end of inner loop
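
    ; each accumulator now holds dither<<12 plus sum(sample * coeff), with
    ; 15-bit chroma samples and 12-bit coefficients; >>19 rescales the sums
    ; to the 8-bit output range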

    psrad           m2, m2, 19
    psrad           m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd          m2, m2, m9
    pmaxsd          m3, m3, m9
    pminsd          m2, m2, m10
    pminsd          m3, m3, m10

    ; At this point we have clamped uint8s arranged in this order:
    ; m2: u1  0  0  0  v1  0  0  0 [...]
    ; m3: u5  0  0  0  v5  0  0  0 [...]
    ;
    ; First, we shuffle the bytes to make the bytes semi-contiguous.
    ; AVX2's vpshufb can't move bytes across 128-bit lanes, so we end up with:
    ; m2: u1 v1 u2 v2 0 0 0 0 0 0 0 0 u3 v3 u4 v4
    ; m3: u5 v5 u6 v6 0 0 0 0 0 0 0 0 u7 v7 u8 v8
    pshufb          m2, m2, m11
    pshufb          m3, m3, m11

    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
    ; permutation to combine the two segments
    vpermd          m2, m12, m2
    vpermd          m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq        [dstq], xm2
    movq    [dstq + 8], xm3

    add            r8d, 8
    add           dstq, 16

    cmp            r8d, dstWidthd
    jl nv12_outer_%1
    RET
%endmacro

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64

;-----------------------------------------------------------------------------
; planar gbr yuv2anyX functions
; void ff_yuv2<gbr_format>_full_X_<opt>(SwsContext *c, const int16_t *lumFilter,
;                                       const int16_t **lumSrcx, int lumFilterSize,
;                                       const int16_t *chrFilter, const int16_t **chrUSrcx,
;                                       const int16_t **chrVSrcx, int chrFilterSize,
;                                       const int16_t **alpSrcx, uint8_t **dest,
;                                       int dstW, int y)
;-----------------------------------------------------------------------------

%if ARCH_X86_64
struc SwsContext
    .padding:           resb 40292 ; offsetof(SwsContext, yuv2rgb_y_offset)
    .yuv2rgb_y_offset:  resd 1
    .yuv2rgb_y_coeff:   resd 1
    .yuv2rgb_v2r_coeff: resd 1
    .yuv2rgb_v2g_coeff: resd 1
    .yuv2rgb_u2g_coeff: resd 1
    .yuv2rgb_u2b_coeff: resd 1
endstruc

%define R m0
%define G m1
%define B m2
%define A m3

%define Y m4
%define U m5
%define V m6

; Clip a signed integer to an unsigned power of two range.
; av_clip_uintp2
; 1 - dest
; 2 - bit position to clip at
%macro CLIPP2 2
    ; (~a) >> 31 & ((1<<p) - 1);
    pcmpeqb m4, m4
    pxor    m4, %1
    psrad   m4, 31
    movu    m5, [pd_yuv2gbrp16_upper%2]
    pand    m4, m5

    ; (a & ~((1<<p) - 1)) == 0
    pandn   m5, %1
    pxor    m6, m6
    pcmpeqd m5, m6
%if cpuflag(avx2)
    vpblendvb %1, m4, %1, m5
%else
    pxor %1, m4
    pand %1, m5
    pxor %1, m4
%endif
%endmacro

; 1 - dest
; 2 - source
%macro LOAD16 2
    %if cpuflag(avx2)
        movu      xm%1, %2
        vpmovsxwd  m%1, xm%1
    %elif cpuflag(sse4)
        movsd      m%1, %2
        pmovsxwd   m%1, m%1
    %else
        movsd      m%1, %2
        punpcklwd  m%1, m%1
        psrad      m%1, 16 ; sign extend
    %endif
%endmacro

; 1 - dest
; 2 - source
; 3 - depth
%macro LOAD_PIXELS 3
    mov ptrq, [%2 + jq*8]
%if %3 >= 16
    movu m%1, [ptrq + xq*4]
%else
    LOAD16 %1, [ptrq + xq*2]
%endif
%endmacro

; 1 - dest
; 2 - source
%macro STORE8 2
    mov ptrq, %1
    %if mmsize > 16
        pshufb m%2, [pb_pack_shuffle8]
        vextractf128 xm4, m%2, 1
        por xm%2, xm4
        movq [ptrq + xq], xm%2
    %else
        %if cpuflag(sse4)
            pshufb m%2, [pb_pack_shuffle8]
        %else
            psrldq m4, m%2, 3
            por    m%2, m4
            psrldq m4, m%2, 6
            por    m%2, m4
        %endif
        movd [ptrq + xq], m%2
    %endif
%endmacro
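
; Note: in the non-SSE4 path above, the low byte of each dword lane is folded
; down to bytes 0-3 with two shift+OR steps before the movd store, since
; pshufb is not available there.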

; 1 - dest
; 2 - source
; 3 - is big endian
%macro STORE16 3
    mov ptrq, %1
    %if mmsize > 16
        %if %3 ; bigendian
            pshufb m%2, [pb_pack_shuffle16be]
        %else
            pshufb m%2, [pb_pack_shuffle16le]
        %endif
        vpermq m%2, m%2, (3 << 6 | 0 << 4 | 3 << 2 | 0 << 0)
        movu [ptrq + xq*2], xm%2
    %else
        %if cpuflag(sse4) && %3 ; bigendian
            pshufb m%2, [pb_pack_shuffle16be]
        %elif cpuflag(sse4)
            pshufb m%2, [pb_pack_shuffle16le]
        %else
            pshuflw m%2, m%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
            pshufhw m%2, m%2, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)
            pshufd  m%2, m%2, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)
            %if %3 ; bigendian
                psrlw m4, m%2, 8
                psllw m%2, 8
                por   m%2, m4
            %endif
        %endif
        movq [ptrq + xq*2], m%2
    %endif
%endmacro

%macro SWAP32 1
%if mmsize > 16 || cpuflag(sse4)
    pshufb  m%1, [pb_shuffle32be]
%else
    psrlw   m4, m%1, 8
    psllw   m%1, 8
    por     m%1, m4
    pshuflw m%1, m%1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0)
    pshufhw m%1, m%1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0)
%endif
%endmacro

; 1 - dest
; 2 - source
; 3 - depth
; 4 - is big endian
%macro STORE_PIXELS 4
%if %3 > 16
    %if %4
        SWAP32 %2
    %endif
    mov ptrq, %1
    movu [ptrq + xq*4], m%2
%elif %3 > 8
    STORE16 %1, %2, %4
%else
    STORE8 %1, %2
%endif
%endmacro

%macro PMULLO 3
%if cpuflag(sse4) || mmsize > 16
    pmulld %1, %2, %3
%else
    %ifidni %1, %2
    %else
        mova %1, %2
    %endif
    pshufd    m7, %1, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0) ; 0xb1
    pshufd    m8, %3, (2 << 6 | 3 << 4 | 0 << 2 | 1 << 0) ; 0xb1
    pmuludq   m7, m8
    pshufd    m7, m7, (3 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; 0xd8
    pmuludq   %1, %3
    pshufd    %1, %1, (3 << 6 | 1 << 4 | 2 << 2 | 0 << 0) ; 0xd8
    punpckldq %1, m7
%endif
%endmacro

; 1 - name
; 2 - depth
; 3 - has alpha
; 4 - is big endian
; 5 - is float
%macro yuv2gbrp_fn 5
%define DEPTH %2
%define HAS_ALPHA %3
%define IS_BE %4
%define FLOAT %5
%define SH (22 + 8 - DEPTH)

%if DEPTH >= 16
    %define RGB_SHIFT 14
    %define A_SHIFT 14
%elif 22 != SH
    %define RGB_SHIFT SH
    %define A_SHIFT (SH-3)
%else
    %define RGB_SHIFT 22
    %define A_SHIFT 19
%endif

%if DEPTH >= 16
    %define YUV_SHIFT 14
    %define Y_START  m9
    %define Y_ROUND  [pd_yuv2gbrp16_round13]
    %define UV_START m9
    %define A_START  m9
    %define A_CLIP2P 30
%else
    %define YUV_SHIFT 10
    %define Y_START  [pd_yuv2gbrp_y_start]
    %define Y_ROUND  m9
    %define UV_START [pd_yuv2gbrp_uv_start]
    %define A_START  [pd_yuv2gbrp_a_start]
    %define A_CLIP2P 27
%endif

cglobal yuv2%1_full_X, 12, 14, 16, ptr, lumFilter, lumSrcx, lumFilterSize, chrFilter, chrUSrcx, chrVSrcx, chrFilterSize, alpSrcx, dest, dstW, y, x, j
    VBROADCASTSS m10, dword [ptrq + SwsContext.yuv2rgb_y_offset]
    VBROADCASTSS m11, dword [ptrq + SwsContext.yuv2rgb_y_coeff]
    VBROADCASTSS m12, dword [ptrq + SwsContext.yuv2rgb_v2r_coeff]
    VBROADCASTSS m13, dword [ptrq + SwsContext.yuv2rgb_v2g_coeff]
    VBROADCASTSS m14, dword [ptrq + SwsContext.yuv2rgb_u2g_coeff]
    VBROADCASTSS m15, dword [ptrq + SwsContext.yuv2rgb_u2b_coeff]

%if DEPTH >= 16
    movu m9, [pd_yuv2gbrp16_start]
%else
    mov xq, (1 << (SH-1))
    movq xm9, xq
    VBROADCASTSS m9, xm9
%endif
    xor xq, xq

    %%loop_x:
        movu Y, Y_START
        movu U, UV_START
        movu V, UV_START

        xor jq, jq
        %%loop_luma:
            movsx ptrd, word [lumFilterq + jq*2]
            movd xm0, ptrd
            VBROADCASTSS m0, xm0
            LOAD_PIXELS 1, lumSrcxq, DEPTH
            PMULLO m1, m1, m0
            paddd Y, m1
            inc jd
            cmp jd, lumFilterSized
            jl %%loop_luma

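        ; Y now holds Y_START plus sum(lumSrc[j][x] * lumFilter[j]); the alpha
        ; and chroma planes are accumulated the same way below, then Y/U/V are
        ; scaled down by YUV_SHIFT and fed through the yuv2rgb coefficients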
%if HAS_ALPHA
        cmp alpSrcxq, 0
        je %%skip_alpha_load
        xor jq, jq
        movu A, A_START
        %%loop_alpha:
            movsx ptrd, word [lumFilterq + jq*2]
            movd xm0, ptrd
            VBROADCASTSS m0, xm0
            LOAD_PIXELS 1, alpSrcxq, DEPTH
            PMULLO m1, m1, m0
            paddd A, m1
            inc jd
            cmp jd, lumFilterSized
            jl %%loop_alpha
%if DEPTH >= 16
        psrad A, 1
        paddd A, [pd_yuv2gbrp16_a_offset]
%endif
        %%skip_alpha_load:
%endif
        xor jq, jq
        %%loop_chr:
            movsx ptrd, word [chrFilterq + jq*2]
            movd xm0, ptrd
            VBROADCASTSS m0, xm0
            LOAD_PIXELS 1, chrUSrcxq, DEPTH
            LOAD_PIXELS 2, chrVSrcxq, DEPTH
            PMULLO m1, m1, m0
            PMULLO m2, m2, m0
            paddd U, m1
            paddd V, m2
            inc jd
            cmp jd, chrFilterSized
            jl %%loop_chr

        psrad Y, YUV_SHIFT
%if DEPTH >= 16
        paddd Y, [pd_yuv2gbrp16_offset]
%endif
        psrad U, YUV_SHIFT
        psrad V, YUV_SHIFT

        psubd Y, m10     ; yuv2rgb_y_offset
        PMULLO Y, Y, m11 ; yuv2rgb_y_coeff
        paddd Y, Y_ROUND

        PMULLO R, V, m12 ; yuv2rgb_v2r_coeff
        PMULLO B, U, m15 ; yuv2rgb_u2b_coeff

        PMULLO U, U, m14 ; yuv2rgb_u2g_coeff
        PMULLO V, V, m13 ; yuv2rgb_v2g_coeff
        paddd G, U, V
        paddd R, Y
        paddd G, Y
        paddd B, Y

%if DEPTH < 16
        CLIPP2 R, 30
        CLIPP2 G, 30
        CLIPP2 B, 30
%endif

        psrad R, RGB_SHIFT
        psrad G, RGB_SHIFT
        psrad B, RGB_SHIFT

%if DEPTH >= 16
        paddd R, [pd_yuv2gbrp_debias]
        paddd G, [pd_yuv2gbrp_debias]
        paddd B, [pd_yuv2gbrp_debias]

        CLIPP2 R, 16
        CLIPP2 G, 16
        CLIPP2 B, 16
%endif

%if FLOAT
        cvtdq2ps R, R
        cvtdq2ps G, G
        cvtdq2ps B, B
        mulps R, [pd_65535_invf]
        mulps G, [pd_65535_invf]
        mulps B, [pd_65535_invf]
%endif
        STORE_PIXELS [destq +  0], 1, DEPTH, IS_BE ; G
        STORE_PIXELS [destq +  8], 2, DEPTH, IS_BE ; B
        STORE_PIXELS [destq + 16], 0, DEPTH, IS_BE ; R

%if HAS_ALPHA
        cmp alpSrcxq, 0
        je %%skip_alpha_store
        CLIPP2 A, A_CLIP2P
        psrad A, A_SHIFT
%if FLOAT
        cvtdq2ps A, A
        mulps A, [pd_65535_invf]
%endif
        STORE_PIXELS [destq + 24], 3, DEPTH, IS_BE
        %%skip_alpha_store:
%endif
        add xq, mmsize/4
        cmp xd, dstWd
        jl %%loop_x

    RET
%endmacro

%macro yuv2gbrp_fn_decl 2
INIT_%1 %2
yuv2gbrp_fn gbrp,       8, 0, 0, 0
yuv2gbrp_fn gbrap,      8, 1, 0, 0
yuv2gbrp_fn gbrp9le,    9, 0, 0, 0
yuv2gbrp_fn gbrp10le,  10, 0, 0, 0
yuv2gbrp_fn gbrap10le, 10, 1, 0, 0
yuv2gbrp_fn gbrp12le,  12, 0, 0, 0
yuv2gbrp_fn gbrap12le, 12, 1, 0, 0
yuv2gbrp_fn gbrp14le,  14, 0, 0, 0
yuv2gbrp_fn gbrp16le,  16, 0, 0, 0
yuv2gbrp_fn gbrap16le, 16, 1, 0, 0
yuv2gbrp_fn gbrpf32le, 32, 0, 0, 1
yuv2gbrp_fn gbrapf32le, 32, 1, 0, 1

yuv2gbrp_fn gbrp9be,    9, 0, 1, 0
yuv2gbrp_fn gbrp10be,  10, 0, 1, 0
yuv2gbrp_fn gbrap10be, 10, 1, 1, 0
yuv2gbrp_fn gbrp12be,  12, 0, 1, 0
yuv2gbrp_fn gbrap12be, 12, 1, 1, 0
yuv2gbrp_fn gbrp14be,  14, 0, 1, 0
yuv2gbrp_fn gbrp16be,  16, 0, 1, 0
yuv2gbrp_fn gbrap16be, 16, 1, 1, 0
yuv2gbrp_fn gbrpf32be, 32, 0, 1, 1
yuv2gbrp_fn gbrapf32be, 32, 1, 1, 1
%endmacro

yuv2gbrp_fn_decl XMM, sse2
yuv2gbrp_fn_decl XMM, sse4

%if HAVE_AVX2_EXTERNAL
yuv2gbrp_fn_decl YMM, avx2
%endif

%endif ; ARCH_X86_64