;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to the 8 row addresses [base], [base+stride], ..., [base3+stride*4]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro
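
; A rough scalar sketch of what the 4x8/8x4 load/store transposes above do
; (illustrative comment only, not assembled): given 8 rows of 4 bytes
; src[8][4], TRANSPOSE4x8_LOAD produces 4 rows of 8 bytes dst[4][8] with
; dst[j][i] = src[i][j], and TRANSPOSE8x4B_STORE performs the inverse.  The
; horizontal deblocking functions below use this to reuse the vertical-filter
; code on transposed columns.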

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2 ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1] ; (p0^q0)&1
    pavgb   m3, m0     ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2     ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4     ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
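
; Rough scalar reference of the normal (bS < 4) luma edge filter that
; LOAD_MASK, DEBLOCK_P0_Q0 (above) and LUMA_Q1 (below) implement with
; unsigned-byte averages and saturating arithmetic.  Illustrative comment
; only; the bit-exact behaviour is defined by the macros themselves:
;     filter only where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
;     delta = clip(((q0-p0)*4 + (p1-q1) + 4) >> 3, -tc, tc)
;     p0'   = clip_uint8(p0 + delta)
;     q0'   = clip_uint8(q0 - delta)
;     p1'   = clip((p2 + ((p0+q0+1)>>1)) >> 1, p1-tc0, p1+tc0)   ; LUMA_Q1, only if |p2-p0| < beta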

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6     ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1] ; (p2^avg(p0,q0))&1
    psubusb %2, %6     ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
    movd    m8, [r4] ; tc0
    lea     r4, [stride_q*3]
    dec     alpha_d        ; alpha-1
    neg     r4
    dec     beta_d         ; beta-1
    add     base3_q, pix_q ; pix-3*stride

    mova    m0, [base3_q + stride_q]   ; p1
    mova    m1, [base3_q + 2*stride_q] ; p0
    mova    m2, [pix_q]                ; q0
    mova    m3, [pix_q + stride_q]     ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [base3_q] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4

    movdqa  m4, [pix_q + 2*stride_q] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [pix_q + stride_q]
    LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6

    DEBLOCK_P0_Q0
    mova    [base3_q + 2*stride_q], m1
    mova    [pix_q], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7, r1d
    lea    r8, [r7+r7*2]
    lea    r6, [r0-4]
    lea    r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    shl    r7, 3
    sub    r6, r7
    sub    r5, r7
    shr    r7, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

%macro DEBLOCK_H_LUMA_MBAFF 0

cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
    movsxd stride_q, stride_d
    dec    alpha_d
    dec    beta_d
    mov    base3_q, pix_q
    lea    stride3_q, [3*stride_q]
    add    base3_q, stride3_q

    movq m0, [pix_q - 4]
    movq m1, [pix_q + stride_q - 4]
    movq m2, [pix_q + 2*stride_q - 4]
    movq m3, [base3_q - 4]
    movq m4, [base3_q + stride_q - 4]
    movq m5, [base3_q + 2*stride_q - 4]
    movq m6, [base3_q + stride3_q - 4]
    movq m7, [base3_q + 4*stride_q - 4]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7

    %assign i 0
    %rep 8
        movq [rsp + 16*i], m %+ i
        %assign i i+1
    %endrep

    ; p2 = m1 [rsp + 16]
    ; p1 = m2 [rsp + 32]
    ; p0 = m3 [rsp + 48]
    ; q0 = m4 [rsp + 64]
    ; q1 = m5 [rsp + 80]
    ; q2 = m6 [rsp + 96]

    SWAP 0, 2
    SWAP 1, 3
    SWAP 2, 4
    SWAP 3, 5

    LOAD_MASK alpha_d, beta_d
    movd      m8, [tc0_q]
    punpcklbw m8, m8
    pcmpeqb   m9, m9
    pcmpeqb   m9, m8
    pandn     m9, m7
    pand      m8, m9

    movdqa  m3, [rsp + 16] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [rsp + 16], [rsp + 32], m6, m4

    movdqa  m4, [rsp + 96] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [rsp + 80]
    LUMA_Q1 m3, m4, [rsp + 96], [rsp + 80], m8, m6

    DEBLOCK_P0_Q0
    SWAP 1, 3
    SWAP 2, 4
    movq m0, [rsp]
    movq m1, [rsp + 16]
    movq m2, [rsp + 32]
    movq m5, [rsp + 80]
    movq m6, [rsp + 96]
    movq m7, [rsp + 112]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
    movq [pix_q - 4], m0
    movq [pix_q + stride_q - 4], m1
    movq [pix_q + 2*stride_q - 4], m2
    movq [base3_q - 4], m3
    movq [base3_q + stride_q - 4], m4
    movq [base3_q + 2*stride_q - 4], m5
    movq [base3_q + stride3_q - 4], m6
    movq [base3_q + 4*stride_q - 4], m7

RET

%endmacro

INIT_XMM sse2
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
%endif

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova   [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH


%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro

%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro
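
; Rough scalar reference for the intra (bS == 4) luma filter built from
; LUMA_INTRA_P012 above and the DEBLOCK_LUMA_INTRA functions below
; (illustrative comment only; the per-instruction comments above give the
; exact averaging identities used).  Where the basic alpha/beta test from
; LOAD_MASK passes:
;     if |p0-q0| < (alpha>>2)+2 && |p2-p0| < beta:      ; strong filter, p side
;         p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
;         p1' = (p2 + p1 + p0 + q0 + 2) >> 2
;         p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
;     else:
;         p0' = (2*p1 + p0 + q1 + 2) >> 2
; The q side is filtered the same way after LUMA_INTRA_SWAP_PQ swaps the roles.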

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7, r1d
    lea    r8, [r7*3]
    lea    r6, [r0-4]
    lea    r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0, [pix_tmp+0x40]
    mov    r1, 0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7, 3
    sub    r6, r7
    sub    r5, r7
    shr    r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3, [r1*3]
    sub    r0, 4
    lea    r2, [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0, [r0+r1*8]
    lea    r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
    lea    r0, [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1, r1m
    mov    r0, r0mp
    lea    r3, [r1*3]
    sub    r0, 4
    lea    r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0, [r0+r1*8]
    lea    r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif

%macro LOAD_8_ROWS 8
    movd m0, %1
    movd m1, %2
    movd m2, %3
    movd m3, %4
    movd m4, %5
    movd m5, %6
    movd m6, %7
    movd m7, %8
%endmacro

%macro STORE_8_ROWS 8
    movd %1, m0
    movd %2, m1
    movd %3, m2
    movd %4, m3
    movd %5, m4
    movd %6, m5
    movd %7, m6
    movd %8, m7
%endmacro

%macro TRANSPOSE_8x4B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
    punpcklwd m0, m2
    punpcklwd m4, m6
    punpckhdq m2, m0, m4
    punpckldq m0, m4
    MOVHL m1, m0
    MOVHL m3, m2
%endmacro

%macro TRANSPOSE_4x8B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpckhwd m4, m0, m2
    punpcklwd m0, m2
    MOVHL  m6, m4
    MOVHL  m2, m0
    pshufd m1, m0, 1
    pshufd m3, m2, 1
    pshufd m5, m4, 1
    pshufd m7, m6, 1
%endmacro

%macro CHROMA_INTER_BODY_XMM 1
    LOAD_MASK alpha_d, beta_d
    movd m6, [tc0_q]
    %rep %1
    punpcklbw m6, m6
    %endrep
    pand m7, m6
    DEBLOCK_P0_Q0
%endmacro

%macro CHROMA_INTRA_BODY_XMM 0
    LOAD_MASK alpha_d, beta_d
    mova    m5, m1
    mova    m6, m2
    pxor    m4, m1, m3
    pand    m4, [pb_1]
    pavgb   m1, m3
    psubusb m1, m4
    pavgb   m1, m0
    pxor    m4, m2, m0
    pand    m4, [pb_1]
    pavgb   m2, m0
    psubusb m2, m4
    pavgb   m2, m3
    psubb   m1, m5
    psubb   m2, m6
    pand    m1, m7
    pand    m2, m7
    paddb   m1, m5
    paddb   m2, m6
%endmacro
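
; Rough scalar reference for the chroma filter bodies above (illustrative
; comment only).  CHROMA_INTER_BODY_XMM applies the same p0/q0 update as the
; luma bS<4 filter (DEBLOCK_P0_Q0) but never modifies p1/q1;
; CHROMA_INTRA_BODY_XMM computes, where the alpha/beta test passes:
;     p0' = (2*p1 + p0 + q1 + 2) >> 2
;     q0' = (2*q1 + q0 + p1 + 2) >> 2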

%macro CHROMA_V_START_XMM 1
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    mov %1, pix_q
    sub %1, stride_q
    sub %1, stride_q
%endmacro

%macro CHROMA_H_START_XMM 2
    movsxdifnidn stride_q, stride_d
    dec alpha_d
    dec beta_d
    lea %2, [3*stride_q]
    mov %1, pix_q
    add %1, %2
%endmacro

%macro DEBLOCK_CHROMA_XMM 1

INIT_XMM %1

cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_V_START_XMM r5
    movq m0, [r5]
    movq m1, [r5 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTER_BODY_XMM 1
    movq [r5 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 1
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)

    lea pix_q, [pix_q + 8*stride_q]
    lea r5,    [r5 + 8*stride_q]
    add tc0_q, 2

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq [rsp], m0
    movq [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq m0, [rsp]
    movq m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
RET

cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
    CHROMA_V_START_XMM r4
    movq m0, [r4]
    movq m1, [r4 + stride_q]
    movq m2, [pix_q]
    movq m3, [pix_q + stride_q]
    CHROMA_INTRA_BODY_XMM
    movq [r4 + stride_q], m1
    movq [pix_q], m2
RET

cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)

    lea pix_q, [pix_q + 8*stride_q]
    lea r4,    [r4 + 8*stride_q]

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
RET

%endmacro ; DEBLOCK_CHROMA_XMM

DEBLOCK_CHROMA_XMM sse2
DEBLOCK_CHROMA_XMM avx
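
; The strength routine below computes the per-edge boundary strength bs that
; the caller uses to select tc0 and the filter mode.  Rough scalar sketch of
; the decision it vectorizes for non-intra edges (illustrative comment only):
;     if      (nnz[b] || nnz[bn])                    bs = 2;
;     else if (ref[b] != ref[bn] ||
;              |mvx[b]-mvx[bn]| >= 4 ||
;              |mvy[b]-mvy[bn]| >= mv_limit)         bs = 1;  ; mv in 1/4-pel units,
;     else                                           bs = 0;  ; mv_limit = 4, or 2 for field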

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir,    int edges,    int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd   %1
%define stepd    %2
%define mask_mvd %3
%define dir      %4
%define d_idx    %5
%define mask_dir %6
%define bidir    %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor             m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd             m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq        m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw           m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
    pshufw           m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
    pshufw           m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb            m0, m2                     ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb            m1, m3                     ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    pshufw           m1, m1, 0x4E
    por              m0, m1
    pshufw           m1, m0, 0x4E
    pminub           m0, m1
%else ; bidir == 0
    movd             m0, [refq+b_idxq+12]
    psubb            m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova             m1, [mvq+b_idxq*4+12*4]
    mova             m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw            m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb         m1, m2
    paddb            m1, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb         m1, m1
    por              m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd             m1, [nnzq+b_idxq+12]
    por              m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub           m1, m7
    pminub           m0, m7
    psllw            m1, 1
    pxor             m2, m2
    pmaxub           m1, m0
    punpcklbw        m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp      dword fieldm, 0
    mova             m7, [pb_1]
    mova             m5, [pb_3]
    je .nofield
    mova             m5, [pb_3_1]
.nofield:
    mova             m6, m5
    paddb            m5, m5

    shl     dword stepd, 3
    shl    dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp    dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET