;*****************************************************************************
;* x86-optimized functions for colorspace filter
;*
;* Copyright (C) 2016 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_1:      times 8 dw 1
pw_2:      times 8 dw 2
pw_4:      times 8 dw 4
pw_8:      times 8 dw 8
pw_16:     times 8 dw 16
pw_64:     times 8 dw 64
pw_128:    times 8 dw 128
pw_256:    times 8 dw 256
pw_512:    times 8 dw 512
pw_1023:   times 8 dw 1023
pw_1024:   times 8 dw 1024
pw_2048:   times 8 dw 2048
pw_4095:   times 8 dw 4095
pw_8192:   times 8 dw 8192
pw_16384:  times 8 dw 16384

pd_1:      times 4 dd 1
pd_2:      times 4 dd 2
pd_128:    times 4 dd 128
pd_512:    times 4 dd 512
pd_2048:   times 4 dd 2048
pd_8192:   times 4 dd 8192
pd_32768:  times 4 dd 32768
pd_131072: times 4 dd 131072

SECTION .text

; void ff_yuv2yuv_420p8to8_sse2(uint8_t *yuv_out[3], ptrdiff_t yuv_out_stride[3],
;                               uint8_t *yuv_in[3], ptrdiff_t yuv_in_stride[3],
;                               int w, int h, const int16_t yuv2yuv_coeffs[3][3][8],
;                               const int16_t yuv_offset[2][8])

%if ARCH_X86_64
%macro YUV2YUV_FN 4 ; in_bitdepth, out_bitdepth, log2_chroma_w (horiz), log2_chroma_h (vert)

%assign %%sh (14 + %1 - %2)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvinoff (128 << (%1 - 8))
%assign %%uvoutoff (128 << (%2 - 8))
%if %3 == 0
%assign %%ss 444
%elif %4 == 0
%assign %%ss 422
%else ; %4 == 1
%assign %%ss 420
%endif ; %3/%4
%if %2 != 8
%assign %%maxval ((1 << %2) - 1)
%endif ; %2 != 8

%assign %%ypsh %%sh - 1
%if %%ypsh > 14
%assign %%yoffsh %%ypsh - 13
%assign %%ypsh 14
%else
%assign %%yoffsh 1
%endif
%assign %%yprnd (1 << (%%yoffsh - 1))
%assign %%ypmul (1 << %%ypsh)

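; Fixed-point scheme used below (derived from the %assigns above): each luma
; sample is, in effect, computed as
;     y_out = ((y_in - y_off_in) * cyy + (y_off_out << sh) + (1 << (sh - 1))) >> sh
; and each chroma sample as
;     u_out = ((u_in - uv_in_off) * cuu + (v_in - uv_in_off) * cuv
;              + (uv_out_off << sh) + (1 << (sh - 1))) >> sh
; (symmetrically for v_out). The luma offset + rounding term is folded into
; the odd words of m10, pre-shifted by %%yoffsh so it fits a signed word for
; pmaddwd: (1 << %%ypsh) * ((y_off_out << %%yoffsh) + %%yprnd) equals
; (y_off_out << sh) + (1 << (sh - 1)), since %%ypsh + %%yoffsh == sh.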
cglobal yuv2yuv_ %+ %%ss %+ p%1to%2, 8, 14, 16, 0 - (4 * mmsize), \
                                     yo, yos, yi, yis, w, h, c, yoff, ui, vi, uo, vo
%if %3 == 1
    inc            wd
    sar            wd, 1
%if %4 == 1
    inc            hd
    sar            hd, 1
%endif ; %4 == 1
%endif ; %3 == 1
    mov [rsp+3*mmsize+0], wd
    mov [rsp+3*mmsize+4], hd

    mova          m10, [cq]
    pxor          m11, m11
    mova          m12, [pd_ %+ %%uvoutoff]
    pslld         m12, %%sh
    paddd         m12, [pd_ %+ %%rnd]
    mova          m13, [pw_ %+ %%uvinoff]
    mova          m14, [yoffq+ 0]          ; y_off_in
    mova          m15, [yoffq+16]          ; y_off_out
%if %%yoffsh != 0
    psllw         m15, %%yoffsh
%endif
    paddw         m15, [pw_ %+ %%yprnd]
    punpcklwd     m10, m15
    mova          m15, [pw_ %+ %%ypmul]
    movh           m0, [cq+1*16]           ; cyu
    movh           m1, [cq+2*16]           ; cyv
    movh           m2, [cq+4*16]           ; cuu
    movh           m3, [cq+5*16]           ; cuv
    movh           m4, [cq+7*16]           ; cvu
    movh           m5, [cq+8*16]           ; cvv
    punpcklwd      m0, m1
    punpcklwd      m2, m3
    punpcklwd      m4, m5
    mova [rsp+0*mmsize], m0
    mova [rsp+1*mmsize], m2
    mova [rsp+2*mmsize], m4

    DEFINE_ARGS yo, yos, yi, yis, ui, vi, uo, vo, uis, vis, uos, vos, x, tmp

    mov           uiq, [yiq+gprsize*1]
    mov           viq, [yiq+gprsize*2]
    mov           yiq, [yiq+gprsize*0]
    mov           uoq, [yoq+gprsize*1]
    mov           voq, [yoq+gprsize*2]
    mov           yoq, [yoq+gprsize*0]
    mov          uisq, [yisq+gprsize*1]
    mov          visq, [yisq+gprsize*2]
    mov          yisq, [yisq+gprsize*0]
    mov          uosq, [yosq+gprsize*1]
    mov          vosq, [yosq+gprsize*2]
    mov          yosq, [yosq+gprsize*0]

.loop_v:
    xor            xq, xq

.loop_h:
%if %4 == 1
    lea          tmpq, [yiq+yisq]
%endif ; %4 == 1
%if %1 == 8
    movu           m0, [yiq+xq*(1<<%3)]    ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*2]         ; y10/11
%endif ; %4 == 1
%if %3 == 1
    movh           m4, [uiq+xq]            ; u
    movh           m5, [viq+xq]            ; v
%else ; %3 != 1
    movu           m4, [uiq+xq]            ; u
    movu           m5, [viq+xq]            ; v
%endif ; %3 ==/!= 1
    punpckhbw      m1, m0, m11
    punpcklbw      m0, m11
%if %4 == 1
    punpckhbw      m3, m2, m11
    punpcklbw      m2, m11
%endif ; %4 == 1
%if %3 == 0
    punpckhbw      m2, m4, m11
    punpckhbw      m3, m5, m11
%endif ; %3 == 0
    punpcklbw      m4, m11
    punpcklbw      m5, m11
%else ; %1 != 8
    movu           m0, [yiq+xq*(2<<%3)]    ; y00/01
    movu           m1, [yiq+xq*(2<<%3)+mmsize] ; y00/01
%if %4 == 1
    movu           m2, [tmpq+xq*4]         ; y10/11
    movu           m3, [tmpq+xq*4+mmsize]  ; y10/11
%endif ; %4 == 1
    movu           m4, [uiq+xq*2]          ; u
    movu           m5, [viq+xq*2]          ; v
%if %3 == 0
    movu           m2, [uiq+xq*2+mmsize]
    movu           m3, [viq+xq*2+mmsize]
%endif ; %3 == 0
%endif ; %1 ==/!= 8
    psubw          m0, m14
    psubw          m1, m14
%if %4 == 1
    psubw          m2, m14
    psubw          m3, m14
%endif ; %4 == 1
    psubw          m4, m13
    psubw          m5, m13
%if %3 == 0
    psubw          m2, m13
    psubw          m3, m13
%endif ; %3 == 0

    SBUTTERFLY     wd, 4, 5, 6
    pmaddwd        m6, m4, [rsp+1*mmsize]
    pmaddwd        m7, m5, [rsp+1*mmsize]
%if %3 == 0
    SBUTTERFLY     wd, 2, 3, 8
    pmaddwd        m8, m2, [rsp+1*mmsize]
    pmaddwd        m9, m3, [rsp+1*mmsize]
%else ; %3 != 0
    pmaddwd        m8, m4, [rsp+2*mmsize]
    pmaddwd        m9, m5, [rsp+2*mmsize]
%endif
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
%if %3 == 0
    movu    [uoq+xq], m6
%else ; %3 != 0
    movh    [uoq+xq], m6
    movhps  [voq+xq], m6
%endif ; %3 ==/!= 0
%else ; %2 != 8
    CLIPW          m6, m11, [pw_ %+ %%maxval]
    CLIPW          m8, m11, [pw_ %+ %%maxval]
    movu  [uoq+xq*2], m6
%if %3 == 0
    movu [uoq+xq*2+mmsize], m8
%else ; %3 != 0
    movu  [voq+xq*2], m8
%endif ; %3 ==/!= 0
%endif ; %2 ==/!= 8

%if %3 == 0
    pmaddwd        m6, m4, [rsp+2*mmsize]
    pmaddwd        m7, m5, [rsp+2*mmsize]
    pmaddwd        m8, m2, [rsp+2*mmsize]
    pmaddwd        m9, m3, [rsp+2*mmsize]
    paddd          m6, m12
    paddd          m7, m12
    paddd          m8, m12
    paddd          m9, m12
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %2 == 8
    packuswb       m6, m8
    movu    [voq+xq], m6
%else ; %2 != 8
    CLIPW          m6, m11, [pw_ %+ %%maxval]
    CLIPW          m8, m11, [pw_ %+ %%maxval]
    movu  [voq+xq*2], m6
    movu [voq+xq*2+mmsize], m8
%endif ; %2 ==/!= 8
%endif ; %3 == 0

    pmaddwd        m4, [rsp+0*mmsize]
    pmaddwd        m5, [rsp+0*mmsize]      ; uv_val
%if %3 == 0
    pmaddwd        m2, [rsp+0*mmsize]
    pmaddwd        m3, [rsp+0*mmsize]
%endif ; %3 == 0

    ; unpack y pixels with m15 (shifted round + offset), then multiply
    ; by m10, add uv pixels, and we're done!
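    ; For subsampled chroma (4:2:0/4:2:2), each uv_val dword is shared by two
    ; horizontally adjacent luma samples, hence the punpckl/hdq self-duplication
    ; below; for 4:4:4 the chroma terms are already at full resolution and the
    ; SWAPs merely put them in the register order the luma code expects.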
%if %3 == 1
    punpckhdq      m8, m4, m4
    punpckldq      m4, m4
    punpckhdq      m9, m5, m5
    punpckldq      m5, m5
%else ; %3 != 1
    SWAP            8, 5, 2
    SWAP            3, 9
%endif ; %3 ==/!= 1
%if %4 == 1
    punpckhwd      m6, m2, m15
    punpcklwd      m2, m15
    punpckhwd      m7, m3, m15
    punpcklwd      m3, m15
    pmaddwd        m2, m10
    pmaddwd        m6, m10
    pmaddwd        m3, m10
    pmaddwd        m7, m10
    paddd          m2, m4
    paddd          m6, m8
    paddd          m3, m5
    paddd          m7, m9
    psrad          m2, %%sh
    psrad          m6, %%sh
    psrad          m3, %%sh
    psrad          m7, %%sh
    packssdw       m2, m6
    packssdw       m3, m7

    lea          tmpq, [yoq+yosq]
%if %2 == 8
    packuswb       m2, m3
    movu [tmpq+xq*2], m2
%else ; %2 != 8
    CLIPW          m2, m11, [pw_ %+ %%maxval]
    CLIPW          m3, m11, [pw_ %+ %%maxval]
    movu [tmpq+xq*4], m2
    movu [tmpq+xq*4+mmsize], m3
%endif ; %2 ==/!= 8
%endif ; %4 == 1

    punpckhwd      m6, m0, m15
    punpcklwd      m0, m15
    punpckhwd      m7, m1, m15
    punpcklwd      m1, m15
    pmaddwd        m0, m10
    pmaddwd        m6, m10
    pmaddwd        m1, m10
    pmaddwd        m7, m10
    paddd          m0, m4
    paddd          m6, m8
    paddd          m1, m5
    paddd          m7, m9
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
    packssdw       m0, m6
    packssdw       m1, m7

%if %2 == 8
    packuswb       m0, m1
    movu [yoq+xq*(1<<%3)], m0
%else ; %2 != 8
    CLIPW          m0, m11, [pw_ %+ %%maxval]
    CLIPW          m1, m11, [pw_ %+ %%maxval]
    movu [yoq+xq*(2<<%3)], m0
    movu [yoq+xq*(2<<%3)+mmsize], m1
%endif ; %2 ==/!= 8

    add            xq, mmsize >> %3
    cmp            xd, dword [rsp+3*mmsize+0]
    jl .loop_h

%if %4 == 1
    lea           yiq, [yiq+yisq*2]
    lea           yoq, [yoq+yosq*2]
%else ; %4 != 1
    add           yiq, yisq
    add           yoq, yosq
%endif ; %4 ==/!= 1
    add           uiq, uisq
    add           viq, visq
    add           uoq, uosq
    add           voq, vosq
    dec dword [rsp+3*mmsize+4]
    jg .loop_v

    RET
%endmacro

%macro YUV2YUV_FNS 2 ; ss_w, ss_h
YUV2YUV_FN  8,  8, %1, %2
YUV2YUV_FN 10,  8, %1, %2
YUV2YUV_FN 12,  8, %1, %2
YUV2YUV_FN  8, 10, %1, %2
YUV2YUV_FN 10, 10, %1, %2
YUV2YUV_FN 12, 10, %1, %2
YUV2YUV_FN  8, 12, %1, %2
YUV2YUV_FN 10, 12, %1, %2
YUV2YUV_FN 12, 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2YUV_FNS 0, 0
YUV2YUV_FNS 1, 0
YUV2YUV_FNS 1, 1

; void ff_yuv2rgb_420p8_sse2(int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int w, int h, const int16_t yuv2rgb_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
%macro YUV2RGB_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh (%1 - 1)
%assign %%rnd (1 << (%%sh - 1))
%assign %%uvoff (1 << (%1 - 1))
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal yuv2rgb_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 8 * mmsize, \
                                 rgb, rgbs, yuv, yuvs, ww, h, c, yoff
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1
    pxor          m11, m11
    mova          m15, [yoffq]             ; yoff
    movh          m14, [cq+  0]            ; cy
    movh          m10, [cq+ 32]            ; crv
    movh          m13, [cq+112]            ; cbu
    movh          m12, [cq+ 64]            ; cgu
    movh           m9, [cq+ 80]            ; cgv
    punpcklwd     m14, [pw_ %+ %%rnd]      ; cy, rnd
    punpcklwd     m13, m11                 ; cbu, 0
    punpcklwd     m11, m10                 ; 0, crv
    punpcklwd     m12, m9                  ; cgu, cgv
    mova [rsp+0*mmsize], m11
    mova [rsp+1*mmsize], m12
    mova [rsp+2*mmsize], m13
    mova [rsp+3*mmsize], m14
    pxor          m14, m14

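    ; Note the pmaddwd pairing trick used throughout this function: a
    ; coefficient is interleaved with a rounding/offset word (e.g. cy with rnd
    ; above) and the data with a matching unit multiplier (y with pw_1 below),
    ; so a single pmaddwd evaluates cy*y + rnd*1 and the bias comes for free
    ; with the multiply.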
    DEFINE_ARGS r, rgbs, y, ys, ww, h, g, b, u, v, us, vs, x, tmp

    mov            gq, [rq+1*gprsize]
    mov            bq, [rq+2*gprsize]
    mov            rq, [rq+0*gprsize]
    mov            uq, [yq+1*gprsize]
    mov            vq, [yq+2*gprsize]
    mov            yq, [yq+0*gprsize]
    mov           usq, [ysq+1*gprsize]
    mov           vsq, [ysq+2*gprsize]
    mov           ysq, [ysq+0*gprsize]

.loop_v:
    xor            xq, xq

.loop_h:
%if %3 == 1
    lea          tmpq, [yq+ysq]
%endif ; %3 == 1
%if %1 == 8
    movu           m0, [yq+xq*(1<<%2)]
%if %3 == 1
    movu           m2, [tmpq+xq*2]
%endif ; %3 == 1
%if %2 == 1
    movh           m4, [uq+xq]
    movh           m5, [vq+xq]
%else ; %2 != 1
    movu           m4, [uq+xq]
    movu           m5, [vq+xq]
%endif ; %2 ==/!= 1
    punpckhbw      m1, m0, m14
    punpcklbw      m0, m14
%if %3 == 1
    punpckhbw      m3, m2, m14
    punpcklbw      m2, m14
%endif ; %3 == 1
%if %2 == 0
    punpckhbw      m2, m4, m14
    punpckhbw      m3, m5, m14
%endif ; %2 == 0
    punpcklbw      m4, m14
    punpcklbw      m5, m14
%else ; %1 != 8
    movu           m0, [yq+xq*(2<<%2)]
    movu           m1, [yq+xq*(2<<%2)+mmsize]
%if %3 == 1
    movu           m2, [tmpq+xq*4]
    movu           m3, [tmpq+xq*4+mmsize]
%endif ; %3 == 1
    movu           m4, [uq+xq*2]
    movu           m5, [vq+xq*2]
%if %2 == 0
    movu           m2, [uq+xq*2+mmsize]
    movu           m3, [vq+xq*2+mmsize]
%endif ; %2 == 0
%endif ; %1 ==/!= 8
    psubw          m0, m15
    psubw          m1, m15
%if %3 == 1
    psubw          m2, m15
    psubw          m3, m15
%endif ; %3 == 1
    psubw          m4, [pw_ %+ %%uvoff]
    psubw          m5, [pw_ %+ %%uvoff]
    SBUTTERFLY     wd, 4, 5, 6
%if %2 == 0
    psubw          m2, [pw_ %+ %%uvoff]
    psubw          m3, [pw_ %+ %%uvoff]
    SBUTTERFLY     wd, 2, 3, 6
%endif ; %2 == 0

    ; calculate y+rnd full-resolution [0-3,6-9]
    punpckhwd      m6, m0, [pw_1]          ; y, 1
    punpcklwd      m0, [pw_1]              ; y, 1
    punpckhwd      m7, m1, [pw_1]          ; y, 1
    punpcklwd      m1, [pw_1]              ; y, 1
    pmaddwd        m0, [rsp+3*mmsize]
    pmaddwd        m6, [rsp+3*mmsize]
    pmaddwd        m1, [rsp+3*mmsize]
    pmaddwd        m7, [rsp+3*mmsize]
%if %3 == 1
    punpckhwd      m8, m2, [pw_1]          ; y, 1
    punpcklwd      m2, [pw_1]              ; y, 1
    punpckhwd      m9, m3, [pw_1]          ; y, 1
    punpcklwd      m3, [pw_1]              ; y, 1
    pmaddwd        m2, [rsp+3*mmsize]
    pmaddwd        m8, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+3*mmsize]
    pmaddwd        m9, [rsp+3*mmsize]
    mova [rsp+4*mmsize], m2
    mova [rsp+5*mmsize], m8
    mova [rsp+6*mmsize], m3
    mova [rsp+7*mmsize], m9
%endif ; %3 == 1

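    ; Each output plane is then (cy*y + rnd) plus a per-channel chroma term
    ; (crv*v for r, cgu*u + cgv*v for g, cbu*u for b). The chroma term is
    ; computed once per chroma sample and, for subsampled input, duplicated
    ; across both luma columns (and reused for the second row at 4:2:0).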
    ; calculate r offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+0*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+0*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+0*mmsize]
    pmaddwd       m12, m2, [rsp+0*mmsize]
    pmaddwd       m13, m3, [rsp+0*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [rq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [rq+xq*(2 << %2)], m10
    mova [rq+xq*(2 << %2)+mmsize], m12

    ; calculate g offsets (un-subsampled, then duplicate)
    pmaddwd       m10, m4, [rsp+1*mmsize]
%if %2 == 1
    pmaddwd       m12, m5, [rsp+1*mmsize]
    punpckhdq     m11, m10, m10
    punpckldq     m10, m10
    punpckhdq     m13, m12, m12
    punpckldq     m12, m12
%else ; %2 != 1
    pmaddwd       m11, m5, [rsp+1*mmsize]
    pmaddwd       m12, m2, [rsp+1*mmsize]
    pmaddwd       m13, m3, [rsp+1*mmsize]
%endif ; %2 ==/!= 1
%if %3 == 1
    paddd          m2, m10, [rsp+4*mmsize]
    paddd          m3, m11, [rsp+5*mmsize]
    paddd          m8, m12, [rsp+6*mmsize]
    paddd          m9, m13, [rsp+7*mmsize]
%endif ; %3 == 1
    paddd         m10, m0
    paddd         m11, m6
    paddd         m12, m1
    paddd         m13, m7
%if %3 == 1
    psrad          m2, %%sh
    psrad          m3, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
%endif ; %3 == 1
    psrad         m10, %%sh
    psrad         m11, %%sh
    psrad         m12, %%sh
    psrad         m13, %%sh
%if %3 == 1
    lea          tmpq, [gq+rgbsq*2]
    packssdw       m2, m3
    packssdw       m8, m9
    mova [tmpq+xq*4], m2
    mova [tmpq+xq*4+mmsize], m8
%endif ; %3 == 1
    packssdw      m10, m11
    packssdw      m12, m13
    mova [gq+xq*(2 << %2)], m10
    mova [gq+xq*(2 << %2)+mmsize], m12

    ; calculate b offsets (un-subsampled, then duplicate)
    pmaddwd        m4, [rsp+2*mmsize]
    pmaddwd        m5, [rsp+2*mmsize]
%if %2 == 1
    punpckhdq      m2, m4, m4
    punpckldq      m4, m4
    punpckhdq      m3, m5, m5
    punpckldq      m5, m5
%else ; %2 != 1
    pmaddwd        m2, [rsp+2*mmsize]
    pmaddwd        m3, [rsp+2*mmsize]
    SWAP            2, 5
%endif ; %2 ==/!= 1
    paddd          m0, m4
    paddd          m6, m2
    paddd          m1, m5
    paddd          m7, m3
%if %3 == 1
    paddd          m4, [rsp+4*mmsize]
    paddd          m2, [rsp+5*mmsize]
    paddd          m5, [rsp+6*mmsize]
    paddd          m3, [rsp+7*mmsize]
%endif ; %3 == 1
    psrad          m0, %%sh
    psrad          m6, %%sh
    psrad          m1, %%sh
    psrad          m7, %%sh
%if %3 == 1
    psrad          m4, %%sh
    psrad          m2, %%sh
    psrad          m5, %%sh
    psrad          m3, %%sh
%endif ; %3 == 1
    packssdw       m0, m6
    packssdw       m1, m7
    movu [bq+xq*(2 << %2)], m0
    movu [bq+xq*(2 << %2)+mmsize], m1
%if %3 == 1
    lea          tmpq, [bq+rgbsq*2]
    packssdw       m4, m2
    packssdw       m5, m3
    movu [tmpq+xq*4], m4
    movu [tmpq+xq*4+mmsize], m5
%endif ; %3 == 1

    add            xd, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h

    lea            rq, [rq+rgbsq*(2 << %3)]
    lea            gq, [gq+rgbsq*(2 << %3)]
    lea            bq, [bq+rgbsq*(2 << %3)]
%if %3 == 1
    lea            yq, [yq+ysq*2]
%else ; %3 != 1
    add            yq, ysq
%endif ; %3 ==/!= 1
    add            uq, usq
    add            vq, vsq
    dec            hd
    jg .loop_v

    RET
%endmacro

%macro YUV2RGB_FNS 2
YUV2RGB_FN  8, %1, %2
YUV2RGB_FN 10, %1, %2
YUV2RGB_FN 12, %1, %2
%endmacro

INIT_XMM sse2
YUV2RGB_FNS 0, 0
YUV2RGB_FNS 1, 0
YUV2RGB_FNS 1, 1

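; void ff_rgb2yuv_420p8_sse2(uint8_t *yuv[3], ptrdiff_t yuv_stride[3],
;                            int16_t *rgb[3], ptrdiff_t rgb_stride,
;                            int w, int h, const int16_t rgb2yuv_coeffs[3][3][8],
;                            const int16_t yuv_offset[8])
; (prototype inferred from the cglobal argument list below, in the style of
; the declarations above)
;
; Same pmaddwd pairing trick as yuv2rgb, with one twist: the offset word is
; kept at a 14-bit scale (psllw by sh-14) and the b samples are interleaved
; with pw_16384, so the madd restores the full shift:
;     16384 * ((off << (sh-14)) + (1 << (sh-15))) == (off << sh) + (1 << (sh-1))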
%macro RGB2YUV_FN 3 ; depth, log2_chroma_w (horiz), log2_chroma_h (vert)
%assign %%sh 29 - %1
%assign %%rnd (1 << (%%sh - 15))
%assign %%uvrnd ((128 << (%1 - 8)) << (%%sh - 14))
%if %1 != 8
%assign %%maxval ((1 << %1) - 1)
%endif ; %1 != 8
%if %2 == 0
%assign %%ss 444
%elif %3 == 0
%assign %%ss 422
%else ; %3 == 1
%assign %%ss 420
%endif ; %2/%3

cglobal rgb2yuv_ %+ %%ss %+ p%1, 8, 14, 16, 0 - 6 * mmsize, \
                                 yuv, yuvs, rgb, rgbs, ww, h, c, off
%if %2 == 1
    inc           wwd
    sar           wwd, 1
%endif ; %2 == 1
%if %3 == 1
    inc            hd
    sar            hd, 1
%endif ; %3 == 1

    ; prepare coeffs
    movh           m8, [offq]
    movh           m9, [pw_ %+ %%uvrnd]
    psllw          m8, %%sh - 14
    paddw          m9, [pw_ %+ %%rnd]
    paddw          m8, [pw_ %+ %%rnd]
    movh           m0, [cq+  0]
    movh           m1, [cq+ 16]
    movh           m2, [cq+ 32]
    movh           m3, [cq+ 48]
    movh           m4, [cq+ 64]
    movh           m5, [cq+ 80]
    movh           m6, [cq+112]
    movh           m7, [cq+128]
    punpcklwd      m0, m1
    punpcklwd      m2, m8
    punpcklwd      m3, m4
    punpcklwd      m4, m5, m9
    punpcklwd      m5, m6
    punpcklwd      m7, m9

    mova [rsp+0*mmsize], m0                ; cry, cgy
    mova [rsp+1*mmsize], m2                ; cby, off + rnd
    mova [rsp+2*mmsize], m3                ; cru, cgu
    mova [rsp+3*mmsize], m4                ; cburv, uvoff + rnd
    mova [rsp+4*mmsize], m5                ; cburv, cgv
    mova [rsp+5*mmsize], m7                ; cbv, uvoff + rnd

    DEFINE_ARGS y, ys, r, rgbs, ww, h, u, v, us, vs, g, b, tmp, x
    mov            gq, [rq+gprsize*1]
    mov            bq, [rq+gprsize*2]
    mov            rq, [rq+gprsize*0]
    mov            uq, [yq+gprsize*1]
    mov            vq, [yq+gprsize*2]
    mov            yq, [yq+gprsize*0]
    mov           usq, [ysq+gprsize*1]
    mov           vsq, [ysq+gprsize*2]
    mov           ysq, [ysq+gprsize*0]

    pxor          m15, m15
.loop_v:
    xor            xd, xd

.loop_h:
    ; top line y
    mova           m0, [rq+xq*(2<<%2)]
    mova           m3, [rq+xq*(2<<%2)+mmsize]
    mova           m1, [gq+xq*(2<<%2)]
    mova           m4, [gq+xq*(2<<%2)+mmsize]
    mova           m2, [bq+xq*(2<<%2)]
    mova           m5, [bq+xq*(2<<%2)+mmsize]

    punpcklwd      m6, m0, m1
    punpckhwd      m7, m0, m1
    punpcklwd      m8, m3, m4
    punpckhwd      m9, m3, m4
    punpcklwd     m10, m2, [pw_16384]
    punpckhwd     m11, m2, [pw_16384]
    punpcklwd     m12, m5, [pw_16384]
    punpckhwd     m13, m5, [pw_16384]

    pmaddwd        m6, [rsp+0*mmsize]
    pmaddwd        m7, [rsp+0*mmsize]
    pmaddwd        m8, [rsp+0*mmsize]
    pmaddwd        m9, [rsp+0*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    pmaddwd       m11, [rsp+1*mmsize]
    pmaddwd       m12, [rsp+1*mmsize]
    pmaddwd       m13, [rsp+1*mmsize]
    paddd          m6, m10
    paddd          m7, m11
    paddd          m8, m12
    paddd          m9, m13
    psrad          m6, %%sh
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m9, %%sh
    packssdw       m6, m7
    packssdw       m8, m9
%if %1 == 8
    packuswb       m6, m8
    movu [yq+xq*(1<<%2)], m6
%else
    CLIPW          m6, m15, [pw_ %+ %%maxval]
    CLIPW          m8, m15, [pw_ %+ %%maxval]
    movu [yq+xq*(2<<%2)], m6
    movu [yq+xq*(2<<%2)+mmsize], m8
%endif

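    ; pmaddwd against pw_1 sums each horizontal pair of words into a dword,
    ; which is how the 2x horizontal subsampling below is done; for 4:2:0 the
    ; bottom row is accumulated on top, so u/v are averaged as (sum + 2) >> 2
    ; over the 2x2 block (or (sum + 1) >> 1 per pair for 4:2:2).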
%if %2 == 1
    ; subsampling cached data
    pmaddwd        m0, [pw_1]
    pmaddwd        m1, [pw_1]
    pmaddwd        m2, [pw_1]
    pmaddwd        m3, [pw_1]
    pmaddwd        m4, [pw_1]
    pmaddwd        m5, [pw_1]

%if %3 == 1
    ; bottom line y, r/g portion only
    lea          tmpq, [rgbsq+xq*2]
    mova           m6, [rq+tmpq*2]
    mova           m9, [rq+tmpq*2+mmsize]
    mova           m7, [gq+tmpq*2]
    mova          m10, [gq+tmpq*2+mmsize]
    mova           m8, [bq+tmpq*2]
    mova          m11, [bq+tmpq*2+mmsize]

    punpcklwd     m12, m6, m7
    punpckhwd     m13, m6, m7
    punpcklwd     m14, m9, m10
    punpckhwd     m15, m9, m10

    ; release two more registers
    pmaddwd        m6, [pw_1]
    pmaddwd        m7, [pw_1]
    pmaddwd        m9, [pw_1]
    pmaddwd       m10, [pw_1]
    paddd          m0, m6
    paddd          m3, m9
    paddd          m1, m7
    paddd          m4, m10

    ; bottom line y, b/rnd portion only
    punpcklwd      m6, m8, [pw_16384]
    punpckhwd      m7, m8, [pw_16384]
    punpcklwd      m9, m11, [pw_16384]
    punpckhwd     m10, m11, [pw_16384]

    pmaddwd       m12, [rsp+0*mmsize]
    pmaddwd       m13, [rsp+0*mmsize]
    pmaddwd       m14, [rsp+0*mmsize]
    pmaddwd       m15, [rsp+0*mmsize]
    pmaddwd        m6, [rsp+1*mmsize]
    pmaddwd        m7, [rsp+1*mmsize]
    pmaddwd        m9, [rsp+1*mmsize]
    pmaddwd       m10, [rsp+1*mmsize]
    paddd         m12, m6
    paddd         m13, m7
    paddd         m14, m9
    paddd         m15, m10
    psrad         m12, %%sh
    psrad         m13, %%sh
    psrad         m14, %%sh
    psrad         m15, %%sh
    packssdw      m12, m13
    packssdw      m14, m15
    lea          tmpq, [yq+ysq]
%if %1 == 8
    packuswb      m12, m14
    movu [tmpq+xq*2], m12
%else
    pxor          m15, m15
    CLIPW         m12, m15, [pw_ %+ %%maxval]
    CLIPW         m14, m15, [pw_ %+ %%maxval]
    movu [tmpq+xq*4], m12
    movu [tmpq+xq*4+mmsize], m14
%endif

    ; complete subsampling of r/g/b pixels for u/v
    pmaddwd        m8, [pw_1]
    pmaddwd       m11, [pw_1]
    paddd          m2, m8
    paddd          m5, m11
    paddd          m0, [pd_2]
    paddd          m1, [pd_2]
    paddd          m2, [pd_2]
    paddd          m3, [pd_2]
    paddd          m4, [pd_2]
    paddd          m5, [pd_2]
    psrad          m0, 2
    psrad          m1, 2
    psrad          m2, 2
    psrad          m3, 2
    psrad          m4, 2
    psrad          m5, 2
%else ; %3 != 1
    paddd          m0, [pd_1]
    paddd          m1, [pd_1]
    paddd          m2, [pd_1]
    paddd          m3, [pd_1]
    paddd          m4, [pd_1]
    paddd          m5, [pd_1]
    psrad          m0, 1
    psrad          m1, 1
    psrad          m2, 1
    psrad          m3, 1
    psrad          m4, 1
    psrad          m5, 1
%endif ; %3 ==/!= 1
    packssdw       m0, m3
    packssdw       m1, m4
    packssdw       m2, m5
%endif ; %2 == 1

    ; convert u/v pixels
    SBUTTERFLY     wd, 0, 1, 6
    punpckhwd      m6, m2, [pw_16384]
    punpcklwd      m2, [pw_16384]

    pmaddwd        m7, m0, [rsp+2*mmsize]
    pmaddwd        m8, m1, [rsp+2*mmsize]
    pmaddwd        m9, m2, [rsp+3*mmsize]
    pmaddwd       m10, m6, [rsp+3*mmsize]
    pmaddwd        m0, [rsp+4*mmsize]
    pmaddwd        m1, [rsp+4*mmsize]
    pmaddwd        m2, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m7, m9
    paddd          m8, m10
    paddd          m0, m2
    paddd          m1, m6
    psrad          m7, %%sh
    psrad          m8, %%sh
    psrad          m0, %%sh
    psrad          m1, %%sh
    packssdw       m7, m8
    packssdw       m0, m1
%if %2 == 1
%if %1 == 8
    packuswb       m7, m0
    movh     [uq+xq], m7
    movhps   [vq+xq], m7
%else
    CLIPW          m7, m15, [pw_ %+ %%maxval]
    CLIPW          m0, m15, [pw_ %+ %%maxval]
    movu   [uq+xq*2], m7
    movu   [vq+xq*2], m0
%endif
%else ; %2 != 1
    ; second set of u/v pixels
    SBUTTERFLY     wd, 3, 4, 6
    punpckhwd      m6, m5, [pw_16384]
    punpcklwd      m5, [pw_16384]

    pmaddwd        m8, m3, [rsp+2*mmsize]
    pmaddwd        m9, m4, [rsp+2*mmsize]
    pmaddwd       m10, m5, [rsp+3*mmsize]
    pmaddwd       m11, m6, [rsp+3*mmsize]
    pmaddwd        m3, [rsp+4*mmsize]
    pmaddwd        m4, [rsp+4*mmsize]
    pmaddwd        m5, [rsp+5*mmsize]
    pmaddwd        m6, [rsp+5*mmsize]
    paddd          m8, m10
    paddd          m9, m11
    paddd          m3, m5
    paddd          m4, m6
    psrad          m8, %%sh
    psrad          m9, %%sh
    psrad          m3, %%sh
    psrad          m4, %%sh
    packssdw       m8, m9
    packssdw       m3, m4

%if %1 == 8
    packuswb       m7, m8
    packuswb       m0, m3
    movu     [uq+xq], m7
    movu     [vq+xq], m0
%else
    CLIPW          m7, m15, [pw_ %+ %%maxval]
    CLIPW          m0, m15, [pw_ %+ %%maxval]
    CLIPW          m8, m15, [pw_ %+ %%maxval]
    CLIPW          m3, m15, [pw_ %+ %%maxval]
    movu   [uq+xq*2], m7
    movu [uq+xq*2+mmsize], m8
    movu   [vq+xq*2], m0
    movu [vq+xq*2+mmsize], m3
%endif
%endif ; %2 ==/!= 1

    add            xq, mmsize >> %2
    cmp            xd, wwd
    jl .loop_h

%if %3 == 0
    add            yq, ysq
%else ; %3 != 0
    lea            yq, [yq+ysq*2]
%endif ; %3 ==/!= 0
    add            uq, usq
    add            vq, vsq
    lea            rq, [rq+rgbsq*(2<<%3)]
    lea            gq, [gq+rgbsq*(2<<%3)]
    lea            bq, [bq+rgbsq*(2<<%3)]
    dec            hd
    jg .loop_v

    RET
%endmacro

%macro RGB2YUV_FNS 2
RGB2YUV_FN  8, %1, %2
RGB2YUV_FN 10, %1, %2
RGB2YUV_FN 12, %1, %2
%endmacro

INIT_XMM sse2
RGB2YUV_FNS 0, 0
RGB2YUV_FNS 1, 0
RGB2YUV_FNS 1, 1

; void ff_multiply3x3_sse2(int16_t *data[3], ptrdiff_t stride,
;                          int w, int h, const int16_t coeff[3][3][8])
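; The coefficients are Q14 fixed point: each result lane is, in effect,
;     out[p] = (c[p][0]*in0 + c[p][1]*in1 + c[p][2]*in2 + 8192) >> 14
; with the rounding constant 8192 = 1 << 13 folded in via the pw_8192
; interleave and the pw_1 multiplier, as in the conversions above.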
INIT_XMM sse2
cglobal multiply3x3, 5, 7, 16, data, stride, ww, h, c
    movh           m0, [cq+  0]
    movh           m1, [cq+ 32]
    movh           m2, [cq+ 48]
    movh           m3, [cq+ 80]
    movh           m4, [cq+ 96]
    movh           m5, [cq+128]
    punpcklwd      m0, [cq+ 16]
    punpcklwd      m1, [pw_8192]
    punpcklwd      m2, [cq+ 64]
    punpcklwd      m3, [pw_8192]
    punpcklwd      m4, [cq+112]
    punpcklwd      m5, [pw_8192]

    DEFINE_ARGS data0, stride, ww, h, data1, data2, x
    shl       strideq, 1
    mov        data1q, [data0q+gprsize*1]
    mov        data2q, [data0q+gprsize*2]
    mov        data0q, [data0q+gprsize*0]

.loop_v:
    xor            xd, xd

.loop_h:
    mova           m6, [data0q+xq*2]
    mova           m7, [data1q+xq*2]
    mova           m8, [data2q+xq*2]
    SBUTTERFLY     wd, 6, 7, 9
    punpckhwd      m9, m8, [pw_1]
    punpcklwd      m8, [pw_1]

    pmaddwd       m10, m6, m0
    pmaddwd       m11, m7, m0
    pmaddwd       m12, m8, m1
    pmaddwd       m13, m9, m1
    paddd         m10, m12
    paddd         m11, m13
    psrad         m10, 14
    psrad         m11, 14

    pmaddwd       m12, m6, m2
    pmaddwd       m13, m7, m2
    pmaddwd       m14, m8, m3
    pmaddwd       m15, m9, m3
    paddd         m12, m14
    paddd         m13, m15
    psrad         m12, 14
    psrad         m13, 14

    pmaddwd        m6, m4
    pmaddwd        m7, m4
    pmaddwd        m8, m5
    pmaddwd        m9, m5
    paddd          m6, m8
    paddd          m7, m9
    psrad          m6, 14
    psrad          m7, 14

    packssdw      m10, m11
    packssdw      m12, m13
    packssdw       m6, m7

    mova [data0q+xq*2], m10
    mova [data1q+xq*2], m12
    mova [data2q+xq*2], m6

    add            xd, mmsize / 2
    cmp            xd, wwd
    jl .loop_h

    add        data0q, strideq
    add        data1q, strideq
    add        data2q, strideq
    dec            hd
    jg .loop_v

    RET
%endif ; ARCH_X86_64