1;***************************************************************************** 2;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code 3;***************************************************************************** 4;* Copyright (C) 2005-2011 x264 project 5;* 6;* Authors: Oskar Arvidsson <oskar@irock.se> 7;* Loren Merritt <lorenm@u.washington.edu> 8;* Fiona Glaser <fiona@x264.com> 9;* 10;* This file is part of FFmpeg. 11;* 12;* FFmpeg is free software; you can redistribute it and/or 13;* modify it under the terms of the GNU Lesser General Public 14;* License as published by the Free Software Foundation; either 15;* version 2.1 of the License, or (at your option) any later version. 16;* 17;* FFmpeg is distributed in the hope that it will be useful, 18;* but WITHOUT ANY WARRANTY; without even the implied warranty of 19;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20;* Lesser General Public License for more details. 21;* 22;* You should have received a copy of the GNU Lesser General Public 23;* License along with FFmpeg; if not, write to the Free Software 24;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 25;****************************************************************************** 26 27%include "libavutil/x86/x86util.asm" 28 29SECTION .text 30 31cextern pw_2 32cextern pw_3 33cextern pw_4 34cextern pw_1023 35%define pw_pixel_max pw_1023 36 37; out: %4 = |%1-%2|-%3 38; clobbers: %5 39%macro ABS_SUB 5 40 psubusw %5, %2, %1 41 psubusw %4, %1, %2 42 por %4, %5 43 psubw %4, %3 44%endmacro 45 46; out: %4 = |%1-%2|<%3 47%macro DIFF_LT 5 48 psubusw %4, %2, %1 49 psubusw %5, %1, %2 50 por %5, %4 ; |%1-%2| 51 pxor %4, %4 52 psubw %5, %3 ; |%1-%2|-%3 53 pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 54%endmacro 55 56%macro LOAD_AB 4 57 movd %1, %3 58 movd %2, %4 59 SPLATW %1, %1 60 SPLATW %2, %2 61%endmacro 62 63; in: %2=tc reg 64; out: %1=splatted tc 65%macro LOAD_TC 2 66 movd %1, [%2] 67 punpcklbw %1, %1 68%if mmsize == 8 69 pshufw %1, %1, 0 70%else 71 pshuflw %1, %1, 01010000b 72 pshufd %1, %1, 01010000b 73%endif 74 psraw %1, 6 75%endmacro 76 77; in: %1=p1, %2=p0, %3=q0, %4=q1 78; %5=alpha, %6=beta, %7-%9=tmp 79; out: %7=mask 80%macro LOAD_MASK 9 81 ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha 82 ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta 83 pand %8, %9 84 ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta 85 pxor %7, %7 86 pand %8, %9 87 pcmpgtw %7, %8 88%endmacro 89 90; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp 91; out: %1=p0', m2=q0' 92%macro DEBLOCK_P0_Q0 7 93 psubw %3, %4 94 pxor %7, %7 95 paddw %3, [pw_4] 96 psubw %7, %5 97 psubw %6, %2, %1 98 psllw %6, 2 99 paddw %3, %6 100 psraw %3, 3 101 mova %6, [pw_pixel_max] 102 CLIPW %3, %7, %5 103 pxor %7, %7 104 paddw %1, %3 105 psubw %2, %3 106 CLIPW %1, %7, %6 107 CLIPW %2, %7, %6 108%endmacro 109 110; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp 111%macro LUMA_Q1 6 112 pavgw %6, %3, %4 ; (p0+q0+1)>>1 113 paddw %1, %6 114 pxor %6, %6 115 psraw %1, 1 116 psubw %6, %5 117 psubw %1, %2 118 CLIPW %1, %6, %5 119 paddw %1, %2 120%endmacro 121 122%macro LUMA_DEBLOCK_ONE 3 123 DIFF_LT m5, %1, bm, m4, m6 124 pxor m6, m6 125 mova %3, m4 126 pcmpgtw m6, tcm 127 pand m4, tcm 128 pandn m6, m7 129 pand m4, m6 130 LUMA_Q1 m5, %2, m1, m2, m4, m6 131%endmacro 132 133%macro LUMA_H_STORE 2 134%if mmsize == 8 135 movq [r0-4], m0 136 movq [r0+r1-4], m1 137 movq [r0+r1*2-4], m2 138 movq [r0+%2-4], m3 139%else 140 movq [r0-4], m0 141 movhps [r0+r1-4], m0 142 movq [r0+r1*2-4], m1 143 movhps [%1-4], m1 144 movq [%1+r1-4], m2 145 movhps [%1+r1*2-4], m2 146 movq [%1+%2-4], m3 147 movhps [%1+r1*4-4], m3 148%endif 149%endmacro 150 151%macro DEBLOCK_LUMA 0 152;----------------------------------------------------------------------------- 153; void ff_deblock_v_luma_10(uint16_t *pix, int stride, int alpha, int beta, 154; int8_t *tc0) 155;----------------------------------------------------------------------------- 156cglobal deblock_v_luma_10, 5,5,8*(mmsize/16) 157 %assign pad 5*mmsize+12-(stack_offset&15) 158 %define tcm [rsp] 159 %define ms1 [rsp+mmsize] 160 %define ms2 [rsp+mmsize*2] 161 %define am [rsp+mmsize*3] 162 %define bm [rsp+mmsize*4] 163 SUB rsp, pad 164 shl r2d, 2 165 shl r3d, 2 166 LOAD_AB m4, m5, r2d, r3d 167 mov r3, 32/mmsize 168 mov r2, r0 169 sub r0, r1 170 mova am, m4 171 sub r0, r1 172 mova bm, m5 173 sub r0, r1 174.loop: 175 mova m0, [r0+r1] 176 mova m1, [r0+r1*2] 177 mova m2, [r2] 178 mova m3, [r2+r1] 179 180 LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 181 LOAD_TC m6, r4 182 mova tcm, m6 183 184 mova m5, [r0] 185 LUMA_DEBLOCK_ONE m1, m0, ms1 186 mova [r0+r1], m5 187 188 mova m5, [r2+r1*2] 189 LUMA_DEBLOCK_ONE m2, m3, ms2 190 mova [r2+r1], m5 191 192 pxor m5, m5 193 mova m6, tcm 194 pcmpgtw m5, tcm 195 psubw m6, ms1 196 pandn m5, m7 197 psubw m6, ms2 198 pand m5, m6 199 DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 200 mova [r0+r1*2], m1 201 mova [r2], m2 202 203 add r0, mmsize 204 add r2, mmsize 205 add r4, mmsize/8 206 dec r3 207 jg .loop 208 ADD rsp, pad 209 RET 210 211cglobal deblock_h_luma_10, 5,6,8*(mmsize/16) 212 %assign pad 7*mmsize+12-(stack_offset&15) 213 %define tcm [rsp] 214 %define ms1 [rsp+mmsize] 215 %define ms2 [rsp+mmsize*2] 216 %define p1m [rsp+mmsize*3] 217 %define p2m [rsp+mmsize*4] 218 %define am [rsp+mmsize*5] 219 %define bm [rsp+mmsize*6] 220 SUB rsp, pad 221 shl r2d, 2 222 shl r3d, 2 223 LOAD_AB m4, m5, r2d, r3d 224 mov r3, r1 225 mova am, m4 226 add r3, r1 227 mov r5, 32/mmsize 228 mova bm, m5 229 add r3, r1 230%if mmsize == 16 231 mov r2, r0 232 add r2, r3 233%endif 234.loop: 235%if mmsize == 8 236 movq m2, [r0-8] ; y q2 q1 q0 237 movq m7, [r0+0] 238 movq m5, [r0+r1-8] 239 movq m3, [r0+r1+0] 240 movq m0, [r0+r1*2-8] 241 movq m6, [r0+r1*2+0] 242 movq m1, [r0+r3-8] 243 TRANSPOSE4x4W 2, 5, 0, 1, 4 244 SWAP 2, 7 245 movq m7, [r0+r3] 246 TRANSPOSE4x4W 2, 3, 6, 7, 4 247%else 248 movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x 249 movu m0, [r0+r1-8] 250 movu m2, [r0+r1*2-8] 251 movu m3, [r2-8] 252 TRANSPOSE4x4W 5, 0, 2, 3, 6 253 mova tcm, m3 254 255 movu m4, [r2+r1-8] 256 movu m1, [r2+r1*2-8] 257 movu m3, [r2+r3-8] 258 movu m7, [r2+r1*4-8] 259 TRANSPOSE4x4W 4, 1, 3, 7, 6 260 261 mova m6, tcm 262 punpcklqdq m6, m7 263 punpckhqdq m5, m4 264 SBUTTERFLY qdq, 0, 1, 7 265 SBUTTERFLY qdq, 2, 3, 7 266%endif 267 268 mova p2m, m6 269 LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 270 LOAD_TC m6, r4 271 mova tcm, m6 272 273 LUMA_DEBLOCK_ONE m1, m0, ms1 274 mova p1m, m5 275 276 mova m5, p2m 277 LUMA_DEBLOCK_ONE m2, m3, ms2 278 mova p2m, m5 279 280 pxor m5, m5 281 mova m6, tcm 282 pcmpgtw m5, tcm 283 psubw m6, ms1 284 pandn m5, m7 285 psubw m6, ms2 286 pand m5, m6 287 DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 288 mova m0, p1m 289 mova m3, p2m 290 TRANSPOSE4x4W 0, 1, 2, 3, 4 291 LUMA_H_STORE r2, r3 292 293 add r4, mmsize/8 294 lea r0, [r0+r1*(mmsize/2)] 295 lea r2, [r2+r1*(mmsize/2)] 296 dec r5 297 jg .loop 298 ADD rsp, pad 299 RET 300%endmacro 301 302%if ARCH_X86_64 303; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 304; m12=alpha, m13=beta 305; out: m0=p1', m3=q1', m1=p0', m2=q0' 306; clobbers: m4, m5, m6, m7, m10, m11, m14 307%macro DEBLOCK_LUMA_INTER_SSE2 0 308 LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 309 LOAD_TC m6, r4 310 DIFF_LT m8, m1, m13, m10, m4 311 DIFF_LT m9, m2, m13, m11, m4 312 pand m6, m7 313 314 mova m14, m6 315 pxor m4, m4 316 pcmpgtw m6, m4 317 pand m6, m14 318 319 mova m5, m10 320 pand m5, m6 321 LUMA_Q1 m8, m0, m1, m2, m5, m4 322 323 mova m5, m11 324 pand m5, m6 325 LUMA_Q1 m9, m3, m1, m2, m5, m4 326 327 pxor m4, m4 328 psubw m6, m10 329 pcmpgtw m4, m14 330 pandn m4, m7 331 psubw m6, m11 332 pand m4, m6 333 DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 334 335 SWAP 0, 8 336 SWAP 3, 9 337%endmacro 338 339%macro DEBLOCK_LUMA_64 0 340cglobal deblock_v_luma_10, 5,5,15 341 %define p2 m8 342 %define p1 m0 343 %define p0 m1 344 %define q0 m2 345 %define q1 m3 346 %define q2 m9 347 %define mask0 m7 348 %define mask1 m10 349 %define mask2 m11 350 shl r2d, 2 351 shl r3d, 2 352 LOAD_AB m12, m13, r2d, r3d 353 mov r2, r0 354 sub r0, r1 355 sub r0, r1 356 sub r0, r1 357 mov r3, 2 358.loop: 359 mova p2, [r0] 360 mova p1, [r0+r1] 361 mova p0, [r0+r1*2] 362 mova q0, [r2] 363 mova q1, [r2+r1] 364 mova q2, [r2+r1*2] 365 DEBLOCK_LUMA_INTER_SSE2 366 mova [r0+r1], p1 367 mova [r0+r1*2], p0 368 mova [r2], q0 369 mova [r2+r1], q1 370 add r0, mmsize 371 add r2, mmsize 372 add r4, 2 373 dec r3 374 jg .loop 375 REP_RET 376 377cglobal deblock_h_luma_10, 5,7,15 378 shl r2d, 2 379 shl r3d, 2 380 LOAD_AB m12, m13, r2d, r3d 381 mov r2, r1 382 add r2, r1 383 add r2, r1 384 mov r5, r0 385 add r5, r2 386 mov r6, 2 387.loop: 388 movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x 389 movu m0, [r0+r1-8] 390 movu m2, [r0+r1*2-8] 391 movu m9, [r5-8] 392 movu m5, [r5+r1-8] 393 movu m1, [r5+r1*2-8] 394 movu m3, [r5+r2-8] 395 movu m7, [r5+r1*4-8] 396 397 TRANSPOSE4x4W 8, 0, 2, 9, 10 398 TRANSPOSE4x4W 5, 1, 3, 7, 10 399 400 punpckhqdq m8, m5 401 SBUTTERFLY qdq, 0, 1, 10 402 SBUTTERFLY qdq, 2, 3, 10 403 punpcklqdq m9, m7 404 405 DEBLOCK_LUMA_INTER_SSE2 406 407 TRANSPOSE4x4W 0, 1, 2, 3, 4 408 LUMA_H_STORE r5, r2 409 add r4, 2 410 lea r0, [r0+r1*8] 411 lea r5, [r5+r1*8] 412 dec r6 413 jg .loop 414 REP_RET 415%endmacro 416 417INIT_XMM sse2 418DEBLOCK_LUMA_64 419%if HAVE_AVX_EXTERNAL 420INIT_XMM avx 421DEBLOCK_LUMA_64 422%endif 423%endif 424 425%macro SWAPMOVA 2 426%ifid %1 427 SWAP %1, %2 428%else 429 mova %1, %2 430%endif 431%endmacro 432 433; in: t0-t2: tmp registers 434; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 435; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' 436%macro LUMA_INTRA_P012 12 ; p0..p3 in memory 437%if ARCH_X86_64 438 paddw t0, %3, %2 439 mova t2, %4 440 paddw t2, %3 441%else 442 mova t0, %3 443 mova t2, %4 444 paddw t0, %2 445 paddw t2, %3 446%endif 447 paddw t0, %1 448 paddw t2, t2 449 paddw t0, %5 450 paddw t2, %9 451 paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) 452 paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) 453 454 psrlw t2, 3 455 psrlw t1, t0, 2 456 psubw t2, %3 457 psubw t1, %2 458 pand t2, %8 459 pand t1, %8 460 paddw t2, %3 461 paddw t1, %2 462 SWAPMOVA %11, t1 463 464 psubw t1, t0, %3 465 paddw t0, t0 466 psubw t1, %5 467 psubw t0, %3 468 paddw t1, %6 469 paddw t1, %2 470 paddw t0, %6 471 psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 472 psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 473 474 pxor t0, t1 475 pxor t1, %1 476 pand t0, %8 477 pand t1, %7 478 pxor t0, t1 479 pxor t0, %1 480 SWAPMOVA %10, t0 481 SWAPMOVA %12, t2 482%endmacro 483 484%macro LUMA_INTRA_INIT 1 485 %xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) 486 %define t0 m4 487 %define t1 m5 488 %define t2 m6 489 %define t3 m7 490 %assign i 4 491%rep %1 492 CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] 493 %assign i i+1 494%endrep 495 SUB rsp, pad 496%endmacro 497 498; in: %1-%3=tmp, %4=p2, %5=q2 499%macro LUMA_INTRA_INTER 5 500 LOAD_AB t0, t1, r2d, r3d 501 mova %1, t0 502 LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 503%if ARCH_X86_64 504 mova %2, t0 ; mask0 505 psrlw t3, %1, 2 506%else 507 mova t3, %1 508 mova %2, t0 ; mask0 509 psrlw t3, 2 510%endif 511 paddw t3, [pw_2] ; alpha/4+2 512 DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 513 pand t2, %2 514 mova t3, %5 ; q2 515 mova %1, t2 ; mask1 516 DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta 517 pand t2, %1 518 mova t3, %4 ; p2 519 mova %3, t2 ; mask1q 520 DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta 521 pand t2, %1 522 mova %1, t2 ; mask1p 523%endmacro 524 525%macro LUMA_H_INTRA_LOAD 0 526%if mmsize == 8 527 movu t0, [r0-8] 528 movu t1, [r0+r1-8] 529 movu m0, [r0+r1*2-8] 530 movu m1, [r0+r4-8] 531 TRANSPOSE4x4W 4, 5, 0, 1, 2 532 mova t4, t0 ; p3 533 mova t5, t1 ; p2 534 535 movu m2, [r0] 536 movu m3, [r0+r1] 537 movu t0, [r0+r1*2] 538 movu t1, [r0+r4] 539 TRANSPOSE4x4W 2, 3, 4, 5, 6 540 mova t6, t0 ; q2 541 mova t7, t1 ; q3 542%else 543 movu t0, [r0-8] 544 movu t1, [r0+r1-8] 545 movu m0, [r0+r1*2-8] 546 movu m1, [r0+r5-8] 547 movu m2, [r4-8] 548 movu m3, [r4+r1-8] 549 movu t2, [r4+r1*2-8] 550 movu t3, [r4+r5-8] 551 TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 552 mova t4, t0 ; p3 553 mova t5, t1 ; p2 554 mova t6, t2 ; q2 555 mova t7, t3 ; q3 556%endif 557%endmacro 558 559; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp 560%macro LUMA_H_INTRA_STORE 9 561%if mmsize == 8 562 TRANSPOSE4x4W %1, %2, %3, %4, %9 563 movq [r0-8], m%1 564 movq [r0+r1-8], m%2 565 movq [r0+r1*2-8], m%3 566 movq [r0+r4-8], m%4 567 movq m%1, %8 568 TRANSPOSE4x4W %5, %6, %7, %1, %9 569 movq [r0], m%5 570 movq [r0+r1], m%6 571 movq [r0+r1*2], m%7 572 movq [r0+r4], m%1 573%else 574 TRANSPOSE2x4x4W %1, %2, %3, %4, %9 575 movq [r0-8], m%1 576 movq [r0+r1-8], m%2 577 movq [r0+r1*2-8], m%3 578 movq [r0+r5-8], m%4 579 movhps [r4-8], m%1 580 movhps [r4+r1-8], m%2 581 movhps [r4+r1*2-8], m%3 582 movhps [r4+r5-8], m%4 583%ifnum %8 584 SWAP %1, %8 585%else 586 mova m%1, %8 587%endif 588 TRANSPOSE2x4x4W %5, %6, %7, %1, %9 589 movq [r0], m%5 590 movq [r0+r1], m%6 591 movq [r0+r1*2], m%7 592 movq [r0+r5], m%1 593 movhps [r4], m%5 594 movhps [r4+r1], m%6 595 movhps [r4+r1*2], m%7 596 movhps [r4+r5], m%1 597%endif 598%endmacro 599 600%if ARCH_X86_64 601;----------------------------------------------------------------------------- 602; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha, 603; int beta) 604;----------------------------------------------------------------------------- 605%macro DEBLOCK_LUMA_INTRA_64 0 606cglobal deblock_v_luma_intra_10, 4,7,16 607 %define t0 m1 608 %define t1 m2 609 %define t2 m4 610 %define p2 m8 611 %define p1 m9 612 %define p0 m10 613 %define q0 m11 614 %define q1 m12 615 %define q2 m13 616 %define aa m5 617 %define bb m14 618 lea r4, [r1*4] 619 lea r5, [r1*3] ; 3*stride 620 neg r4 621 add r4, r0 ; pix-4*stride 622 mov r6, 2 623 mova m0, [pw_2] 624 shl r2d, 2 625 shl r3d, 2 626 LOAD_AB aa, bb, r2d, r3d 627.loop: 628 mova p2, [r4+r1] 629 mova p1, [r4+2*r1] 630 mova p0, [r4+r5] 631 mova q0, [r0] 632 mova q1, [r0+r1] 633 mova q2, [r0+2*r1] 634 635 LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 636 mova t2, aa 637 psrlw t2, 2 638 paddw t2, m0 ; alpha/4+2 639 DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 640 DIFF_LT p2, p0, bb, t1, t0 ; m7 = |p2-p0| < beta 641 DIFF_LT q2, q0, bb, m7, t0 ; t1 = |q2-q0| < beta 642 pand m6, m3 643 pand m7, m6 644 pand m6, t1 645 LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] 646 LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] 647 add r0, mmsize 648 add r4, mmsize 649 dec r6 650 jg .loop 651 REP_RET 652 653;----------------------------------------------------------------------------- 654; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha, 655; int beta) 656;----------------------------------------------------------------------------- 657cglobal deblock_h_luma_intra_10, 4,7,16 658 %define t0 m15 659 %define t1 m14 660 %define t2 m2 661 %define q3 m5 662 %define q2 m8 663 %define q1 m9 664 %define q0 m10 665 %define p0 m11 666 %define p1 m12 667 %define p2 m13 668 %define p3 m4 669 %define spill [rsp] 670 %assign pad 24-(stack_offset&15) 671 SUB rsp, pad 672 lea r4, [r1*4] 673 lea r5, [r1*3] ; 3*stride 674 add r4, r0 ; pix+4*stride 675 mov r6, 2 676 mova m0, [pw_2] 677 shl r2d, 2 678 shl r3d, 2 679.loop: 680 movu q3, [r0-8] 681 movu q2, [r0+r1-8] 682 movu q1, [r0+r1*2-8] 683 movu q0, [r0+r5-8] 684 movu p0, [r4-8] 685 movu p1, [r4+r1-8] 686 movu p2, [r4+r1*2-8] 687 movu p3, [r4+r5-8] 688 TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 689 690 LOAD_AB m1, m2, r2d, r3d 691 LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 692 psrlw m1, 2 693 paddw m1, m0 ; alpha/4+2 694 DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 695 DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta 696 DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta 697 pand m6, m3 698 pand m7, m6 699 pand m6, t1 700 701 mova spill, q3 702 LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 703 LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 704 mova m7, spill 705 706 LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 707 708 lea r0, [r0+r1*8] 709 lea r4, [r4+r1*8] 710 dec r6 711 jg .loop 712 ADD rsp, pad 713 RET 714%endmacro 715 716INIT_XMM sse2 717DEBLOCK_LUMA_INTRA_64 718%if HAVE_AVX_EXTERNAL 719INIT_XMM avx 720DEBLOCK_LUMA_INTRA_64 721%endif 722 723%endif 724 725%macro DEBLOCK_LUMA_INTRA 0 726;----------------------------------------------------------------------------- 727; void ff_deblock_v_luma_intra_10(uint16_t *pix, int stride, int alpha, 728; int beta) 729;----------------------------------------------------------------------------- 730cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16) 731 LUMA_INTRA_INIT 3 732 lea r4, [r1*4] 733 lea r5, [r1*3] 734 neg r4 735 add r4, r0 736 mov r6, 32/mmsize 737 shl r2d, 2 738 shl r3d, 2 739.loop: 740 mova m0, [r4+r1*2] ; p1 741 mova m1, [r4+r5] ; p0 742 mova m2, [r0] ; q0 743 mova m3, [r0+r1] ; q1 744 LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] 745 LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] 746 mova t3, [r0+r1*2] ; q2 747 LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] 748 add r0, mmsize 749 add r4, mmsize 750 dec r6 751 jg .loop 752 ADD rsp, pad 753 RET 754 755;----------------------------------------------------------------------------- 756; void ff_deblock_h_luma_intra_10(uint16_t *pix, int stride, int alpha, 757; int beta) 758;----------------------------------------------------------------------------- 759cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16) 760 LUMA_INTRA_INIT 8 761%if mmsize == 8 762 lea r4, [r1*3] 763 mov r5, 32/mmsize 764%else 765 lea r4, [r1*4] 766 lea r5, [r1*3] ; 3*stride 767 add r4, r0 ; pix+4*stride 768 mov r6, 32/mmsize 769%endif 770 shl r2d, 2 771 shl r3d, 2 772.loop: 773 LUMA_H_INTRA_LOAD 774 LUMA_INTRA_INTER t8, t9, t10, t5, t6 775 776 LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 777 mova t3, t6 ; q2 778 LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 779 780 mova m2, t4 781 mova m0, t11 782 mova m1, t5 783 mova m3, t8 784 mova m6, t6 785 786 LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 787 788 lea r0, [r0+r1*(mmsize/2)] 789%if mmsize == 8 790 dec r5 791%else 792 lea r4, [r4+r1*(mmsize/2)] 793 dec r6 794%endif 795 jg .loop 796 ADD rsp, pad 797 RET 798%endmacro 799 800%if ARCH_X86_64 == 0 801INIT_MMX mmxext 802DEBLOCK_LUMA 803DEBLOCK_LUMA_INTRA 804INIT_XMM sse2 805DEBLOCK_LUMA 806DEBLOCK_LUMA_INTRA 807%if HAVE_AVX_EXTERNAL 808INIT_XMM avx 809DEBLOCK_LUMA 810DEBLOCK_LUMA_INTRA 811%endif 812%endif 813 814; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp 815; out: %1=p0', %2=q0' 816%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 817 mova %6, [pw_2] 818 paddw %6, %3 819 paddw %6, %4 820 paddw %7, %6, %2 821 paddw %6, %1 822 paddw %6, %3 823 paddw %7, %4 824 psraw %6, 2 825 psraw %7, 2 826 psubw %6, %1 827 psubw %7, %2 828 pand %6, %5 829 pand %7, %5 830 paddw %1, %6 831 paddw %2, %7 832%endmacro 833 834%macro CHROMA_V_LOAD 1 835 mova m0, [r0] ; p1 836 mova m1, [r0+r1] ; p0 837 mova m2, [%1] ; q0 838 mova m3, [%1+r1] ; q1 839%endmacro 840 841%macro CHROMA_V_STORE 0 842 mova [r0+1*r1], m1 843 mova [r0+2*r1], m2 844%endmacro 845 846; in: 8 rows of 4 words in %4..%11 847; out: 4 rows of 8 words in m0..m3 848%macro TRANSPOSE4x8W_LOAD 8 849 movq m0, %1 850 movq m2, %2 851 movq m1, %3 852 movq m3, %4 853 854 punpcklwd m0, m2 855 punpcklwd m1, m3 856 punpckhdq m2, m0, m1 857 punpckldq m0, m1 858 859 movq m4, %5 860 movq m6, %6 861 movq m5, %7 862 movq m3, %8 863 864 punpcklwd m4, m6 865 punpcklwd m5, m3 866 punpckhdq m6, m4, m5 867 punpckldq m4, m5 868 869 punpckhqdq m1, m0, m4 870 punpcklqdq m0, m4 871 punpckhqdq m3, m2, m6 872 punpcklqdq m2, m6 873%endmacro 874 875; in: 4 rows of 8 words in m0..m3 876; out: 8 rows of 4 words in %1..%8 877%macro TRANSPOSE8x4W_STORE 8 878 TRANSPOSE4x4W 0, 1, 2, 3, 4 879 movq %1, m0 880 movhps %2, m0 881 movq %3, m1 882 movhps %4, m1 883 movq %5, m2 884 movhps %6, m2 885 movq %7, m3 886 movhps %8, m3 887%endmacro 888 889; %1 = base + 3*stride 890; %2 = 3*stride (unused on mmx) 891; %3, %4 = place to store p1 and q1 values 892%macro CHROMA_H_LOAD 4 893 %if mmsize == 8 894 movq m0, [pix_q - 4] 895 movq m1, [pix_q + stride_q - 4] 896 movq m2, [pix_q + 2*stride_q - 4] 897 movq m3, [%1 - 4] 898 TRANSPOSE4x4W 0, 1, 2, 3, 4 899 %else 900 TRANSPOSE4x8W_LOAD PASS8ROWS(pix_q-4, %1-4, stride_q, %2) 901 %endif 902 mova %3, m0 903 mova %4, m3 904%endmacro 905 906; %1 = base + 3*stride 907; %2 = 3*stride (unused on mmx) 908; %3, %4 = place to load p1 and q1 values 909%macro CHROMA_H_STORE 4 910 mova m0, %3 911 mova m3, %4 912 %if mmsize == 8 913 TRANSPOSE4x4W 0, 1, 2, 3, 4 914 movq [pix_q - 4], m0 915 movq [pix_q + stride_q - 4], m1 916 movq [pix_q + 2*stride_q - 4], m2 917 movq [%1 - 4], m3 918 %else 919 TRANSPOSE8x4W_STORE PASS8ROWS(pix_q-4, %1-4, stride_q, %2) 920 %endif 921%endmacro 922 923%macro CHROMA_V_LOAD_TC 2 924 movd %1, [%2] 925 punpcklbw %1, %1 926 punpcklwd %1, %1 927 psraw %1, 6 928%endmacro 929 930%macro DEBLOCK_CHROMA 0 931;----------------------------------------------------------------------------- 932; void ff_deblock_v_chroma_10(uint16_t *pix, int stride, int alpha, int beta, 933; int8_t *tc0) 934;----------------------------------------------------------------------------- 935cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) 936 mov r5, r0 937 sub r0, r1 938 sub r0, r1 939 shl r2d, 2 940 shl r3d, 2 941%if mmsize < 16 942 mov r6, 16/mmsize 943.loop: 944%endif 945 CHROMA_V_LOAD r5 946 LOAD_AB m4, m5, r2d, r3d 947 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 948 pxor m4, m4 949 CHROMA_V_LOAD_TC m6, r4 950 psubw m6, [pw_3] 951 pmaxsw m6, m4 952 pand m7, m6 953 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 954 CHROMA_V_STORE 955%if mmsize < 16 956 add r0, mmsize 957 add r5, mmsize 958 add r4, mmsize/4 959 dec r6 960 jg .loop 961 REP_RET 962%else 963 RET 964%endif 965 966;----------------------------------------------------------------------------- 967; void ff_deblock_v_chroma_intra_10(uint16_t *pix, int stride, int alpha, 968; int beta) 969;----------------------------------------------------------------------------- 970cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) 971 mov r4, r0 972 sub r0, r1 973 sub r0, r1 974 shl r2d, 2 975 shl r3d, 2 976%if mmsize < 16 977 mov r5, 16/mmsize 978.loop: 979%endif 980 CHROMA_V_LOAD r4 981 LOAD_AB m4, m5, r2d, r3d 982 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 983 CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 984 CHROMA_V_STORE 985%if mmsize < 16 986 add r0, mmsize 987 add r4, mmsize 988 dec r5 989 jg .loop 990 REP_RET 991%else 992 RET 993%endif 994 995;----------------------------------------------------------------------------- 996; void ff_deblock_h_chroma_10(uint16_t *pix, int stride, int alpha, int beta, 997; int8_t *tc0) 998;----------------------------------------------------------------------------- 999cglobal deblock_h_chroma_10, 5, 7, 8, 0-2*mmsize, pix_, stride_, alpha_, beta_, tc0_ 1000 shl alpha_d, 2 1001 shl beta_d, 2 1002 mov r5, pix_q 1003 lea r6, [3*stride_q] 1004 add r5, r6 1005%if mmsize == 8 1006 mov r6d, 2 1007 .loop: 1008%endif 1009 1010 CHROMA_H_LOAD r5, r6, [rsp], [rsp + mmsize] 1011 LOAD_AB m4, m5, alpha_d, beta_d 1012 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 1013 pxor m4, m4 1014 CHROMA_V_LOAD_TC m6, tc0_q 1015 psubw m6, [pw_3] 1016 pmaxsw m6, m4 1017 pand m7, m6 1018 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 1019 CHROMA_H_STORE r5, r6, [rsp], [rsp + mmsize] 1020 1021%if mmsize == 8 1022 lea pix_q, [pix_q + 4*stride_q] 1023 lea r5, [r5 + 4*stride_q] 1024 add tc0_q, 2 1025 dec r6d 1026 jg .loop 1027%endif 1028RET 1029 1030;----------------------------------------------------------------------------- 1031; void ff_deblock_h_chroma422_10(uint16_t *pix, int stride, int alpha, int beta, 1032; int8_t *tc0) 1033;----------------------------------------------------------------------------- 1034cglobal deblock_h_chroma422_10, 5, 7, 8, 0-3*mmsize, pix_, stride_, alpha_, beta_, tc0_ 1035 shl alpha_d, 2 1036 shl beta_d, 2 1037 1038 movd m0, [tc0_q] 1039 punpcklbw m0, m0 1040 psraw m0, 6 1041 movq [rsp], m0 1042 1043 mov r5, pix_q 1044 lea r6, [3*stride_q] 1045 add r5, r6 1046 1047 mov r4, -8 1048 .loop: 1049 1050 CHROMA_H_LOAD r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize] 1051 LOAD_AB m4, m5, alpha_d, beta_d 1052 LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 1053 pxor m4, m4 1054 movd m6, [rsp + r4 + 8] 1055 punpcklwd m6, m6 1056 punpcklwd m6, m6 1057 psubw m6, [pw_3] 1058 pmaxsw m6, m4 1059 pand m7, m6 1060 DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 1061 CHROMA_H_STORE r5, r6, [rsp + 1*mmsize], [rsp + 2*mmsize] 1062 1063 lea pix_q, [pix_q + (mmsize/2)*stride_q] 1064 lea r5, [r5 + (mmsize/2)*stride_q] 1065 add r4, (mmsize/4) 1066 jl .loop 1067RET 1068 1069%endmacro 1070 1071%if ARCH_X86_64 == 0 1072INIT_MMX mmxext 1073DEBLOCK_CHROMA 1074%endif 1075INIT_XMM sse2 1076DEBLOCK_CHROMA 1077%if HAVE_AVX_EXTERNAL 1078INIT_XMM avx 1079DEBLOCK_CHROMA 1080%endif 1081