;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;*           (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

minshort:              times 8 dw 0x8000
yuv2yuvX_16_start:     times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:     times 4 dd 0x10000
yuv2yuvX_9_start:      times 4 dd 0x20000
yuv2yuvX_10_upper:     times 8 dw 0x3ff
yuv2yuvX_9_upper:      times 8 dw 0x1ff
pd_4:                  times 4 dd 4
pd_4min0x40000:        times 4 dd 4 - (0x40000)
pw_16:                 times 8 dw 16
pw_32:                 times 8 dw 32
pd_255:                times 8 dd 255
pw_512:                times 8 dw 512
pw_1024:               times 8 dw 1024

yuv2nv12_shuffle_mask: times 2 db 0, 4,  8, 12, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
yuv2nv21_shuffle_mask: times 2 db 4, 0, 12,  8, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1, \
                                 -1, -1, -1, -1
yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7

SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
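
; Roughly equivalent scalar code for the 8-bit yuv2planeX case (a sketch for
; orientation only; the C templates in libswscale/output.c are the reference):
;
;     for (i = 0; i < dstW; i++) {
;         int val = dither[(i + offset) & 7] << 12;
;         for (j = 0; j < filterSize; j++)
;             val += src[j][i] * filter[j];
;         dst[i] = av_clip_uint8(val >> 19);
;     }
;
; The 9/10/16-bit variants seed the accumulator with the yuv2yuvX_*_start
; rounding constants instead of dither and adjust the final shift and clamp
; to the output depth.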
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
    ; the rep here is for the 8-bit output MMX case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2, m8
    mova            m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2, m1
%endif ; %1 == 8/9/10/16
    movsx     cntr_reg, fltsizem
.filterloop_%2_ %+ %%i:
    ; input pixels
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7, m0, 0               ; coeff[0]
    pshuflw         m0, m0, 0x55            ; coeff[1]
    pmovsxwd        m7, m7                  ; word -> dword
    pmovsxwd        m0, m0                  ; word -> dword

    pmulld          m3, m7
    pmulld          m5, m7
    pmulld          m4, m0
    pmulld          m6, m0

    paddd           m2, m3
    paddd           m1, m5
    paddd           m2, m4
    paddd           m1, m6
%else ; %1 == 10/9/8
    punpcklwd       m5, m3, m4
    punpckhwd       m3, m4
    SPLATD          m0

    pmaddwd         m5, m0
    pmaddwd         m3, m0

    paddd           m2, m5
    paddd           m1, m3
%endif ; %1 == 8/9/10/16

    sub       cntr_reg, 2
    jg .filterloop_%2_ %+ %%i

%if %1 == 16
    psrad           m2, 31 - %1
    psrad           m1, 31 - %1
%else ; %1 == 10/9/8
    psrad           m2, 27 - %1
    psrad           m1, 27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw        m2, m1
    packuswb        m2, m2
    movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2, m1
    paddw           m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2, m1
%else ; mmxext/sse2
    packssdw        m2, m1
    pmaxsw          m2, m6
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mov%2 [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16

    add             r5, mmsize/2
    sub             wd, mmsize/2

%assign %%i %%i+2
%endrep
    jg .pixelloop_%2
%endmacro

%macro yuv2planeX_fn 3

%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6, m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB            rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq        m_dith, [ditherq]          ; dither
    test       offsetd, offsetd
    jz .no_rot
%if mmsize == 16
    punpcklqdq  m_dith, m_dith
%endif ; mmsize == 16
    PALIGNR     m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
    punpcklbw   m_dith, m6
%if ARCH_X86_64
    punpcklwd       m8, m_dith, m6
    pslld           m8, 12
%else ; x86-32
    punpcklwd       m5, m_dith, m6
    pslld           m5, 12
%endif ; x86-32/64
    punpckhwd   m_dith, m6
    pslld       m_dith, 12
%if ARCH_X86_32
    mova     [rsp+ 0], m5
    mova     [rsp+16], m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5, m_dith, m6
    punpckhbw   m_dith, m6
    punpcklwd       m4, m5, m6
    punpckhwd       m5, m6
    punpcklwd       m3, m_dith, m6
    punpckhwd   m_dith, m6
    pslld           m4, 12
    pslld           m5, 12
    pslld           m3, 12
    pslld       m_dith, 12
    mova     [rsp+ 0], m4
    mova     [rsp+ 8], m5
    mova     [rsp+16], m3
    mova     [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5, r5

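    ; r5 is the output pixel index. With 16-byte registers and 9/10/16-bit
    ; output the stores in the main loop are a full register wide, so an
    ; aligned or unaligned variant is picked based on the alignment of dst.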
%if mmsize == 8 || %1 == 8
    yuv2planeX_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2planeX_mainloop %1, a
    REP_RET
.unaligned:
    yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16

%if %1 == 8
%if ARCH_X86_32
    ADD            rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; %1=output-bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1
    mov%2    [dstq+wq], m0
%elif %1 == 16
    paddd           m0, m4, [srcq+wq*4+mmsize*0]
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4
    pmaxsw          m1, m4
    pminsw          m0, m3
    pminsw          m1, m3
    mov%2 [dstq+wq*2+mmsize*0], m0
    mov%2 [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl .loop_%2
%endmacro

%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add           dstq, wq
%else ; %1 != 8
    lea           dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea           srcq, [srcq+wq*4]
%else ; %1 != 16
    lea           srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4, m4                 ; zero

    ; create registers holding dither
    movq            m3, [ditherq]          ; dither
    test       offsetd, offsetd
    jz .no_rot
%if mmsize == 16
    punpcklqdq      m3, m3
%endif ; mmsize == 16
    PALIGNR         m3, m3, 3, m2
.no_rot:
%if mmsize == 8
    mova            m2, m3
    punpckhbw       m3, m4                 ; byte->word
    punpcklbw       m2, m4                 ; byte->word
%else
    punpcklbw       m3, m4
    mova            m2, m3
%endif
%elif %1 == 9
    pxor            m4, m4
    mova            m3, [pw_512]
    mova            m2, [pw_32]
%elif %1 == 10
    pxor            m4, m4
    mova            m3, [pw_1024]
    mova            m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..
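
    ; Roughly equivalent scalar code for the 8-bit yuv2plane1 case (a sketch
    ; for orientation only; the C templates in libswscale/output.c are the
    ; reference):
    ;
    ;     for (i = 0; i < dstW; i++)
    ;         dst[i] = av_clip_uint8((src[i] + dither[(i + offset) & 7]) >> 7);
    ;
    ; The 9/10-bit variants add a rounding constant (pw_32/pw_16) instead of
    ; dither and clamp to the output range; the 16-bit variant takes 19-bit
    ; int32_t input, adds a rounding constant of 4 and shifts right by 3.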

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    test          dstq, 15
    jnz .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

%undef movsx

;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------

%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth

    mov tmp1q, qword [ditherq]
    movq xm0, tmp1q
    ror tmp1q, 24
    movq xm1, tmp1q

    pmovzxbd m0, xm0
    pslld m0, m0, 12                        ; ditherLo
    pmovzxbd m1, xm1
    pslld m1, m1, 12                        ; ditherHi

    pxor m9, m9                             ; uint8_min dwords
    mova m10, [pd_255]                      ; uint8_max dwords
    mova m11, [%1_shuffle_mask]             ; shuffle_mask
    mova m12, [yuv2nv12_permute_mask]       ; permute mask

    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth

    xor r8q, r8q

nv12_outer_%1:
    mova m2, m0                             ; resultLo
    mova m3, m1                             ; resultHi
    xor r9q, r9q

nv12_inner_%1:
    movsx r10d, word [filterq + (2 * r9q)]
    movd xm4, r10d
    vpbroadcastd m4, xm4                    ; filter

    mov tmp1q, [uq + (gprsize * r9q)]
    mova xm7, oword [tmp1q + 2 * r8q]

    mov tmp2q, [vq + (gprsize * r9q)]
    mova xm8, oword [tmp2q + 2 * r8q]

    punpcklwd xm5, xm7, xm8
    pmovsxwd m5, xm5                        ; multiplicandsLo
    punpckhwd xm6, xm7, xm8
    pmovsxwd m6, xm6                        ; multiplicandsHi

    pmulld m7, m5, m4                       ; mulResultLo
    pmulld m8, m6, m4                       ; mulResultHi
    paddd m2, m2, m7                        ; resultLo += mulResultLo
    paddd m3, m3, m8                        ; resultHi += mulResultHi

    inc r9d
    cmp r9d, filterSized
    jl nv12_inner_%1
    ; end of inner loop

    psrad m2, m2, 19
    psrad m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd m2, m2, m9
    pmaxsd m3, m3, m9
    pminsd m2, m2, m10
    pminsd m3, m3, m10

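    ; In scalar terms each dword of m2/m3 now holds, for its U or V sample i
    ; (a sketch; see the C reference in libswscale/output.c):
    ;     av_clip_uint8(((dither << 12) + sum_j(filter[j] * src[j][i])) >> 19)
    ; where dither is the per-lane byte prepared in ditherLo/ditherHi above.
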
    ; At this point we have clamped uint8s arranged in this order:
    ; m2: u1  0  0  0  v1  0  0  0 [...]
    ; m3: u5  0  0  0  v5  0  0  0 [...]
    ;
    ; First, we shuffle the bytes to make the bytes semi-contiguous.
    ; AVX2's pshufb can't move bytes across 128-bit lanes, so we'll end up with:
    ; m2: u1 v1 u2 v2  0  0  0  0  0  0  0  0 u3 v3 u4 v4
    ; m3: u5 v5 u6 v6  0  0  0  0  0  0  0  0 u7 v7 u8 v8
    pshufb m2, m2, m11
    pshufb m3, m3, m11

    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
    ; permutation to combine the two segments
    vpermd m2, m12, m2
    vpermd m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq [dstq], xm2
    movq [dstq + 8], xm3

    add r8d, 8
    add dstq, 16

    cmp r8d, dstWidthd
    jl nv12_outer_%1
    RET
%endmacro

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64