;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1:  times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)

; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh      m0, %4
    movh      m2, %5
    movh      m1, %6
    movh      m3, %7
    punpckl%1 m0, m2
    punpckl%1 m1, m3
    mova      m2, m0
    punpckl%2 m0, m1
    punpckh%2 m2, m1

    movh      m4, %8
    movh      m6, %9
    movh      m5, %10
    movh      m7, %11
    punpckl%1 m4, m6
    punpckl%1 m5, m7
    mova      m6, m4
    punpckl%2 m4, m5
    punpckh%2 m6, m5

    punpckh%3 m1, m0, m4
    punpckh%3 m3, m2, m6
    punpckl%3 m0, m4
    punpckl%3 m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq m4, m0, m0
    punpckhdq m5, m1, m1
    punpckhdq m6, m2, m2

    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklwd m1, m0, m2
    punpckhwd m0, m2
    movh      %1, m1
    punpckhdq m1, m1
    movh      %2, m1
    movh      %3, m0
    punpckhdq m0, m0
    movh      %4, m0

    punpckhdq m3, m3
    punpcklbw m4, m5
    punpcklbw m6, m3
    punpcklwd m5, m4, m6
    punpckhwd m4, m6
    movh      %5, m5
    punpckhdq m5, m5
    movh      %6, m5
    movh      %7, m4
    punpckhdq m4, m4
    movh      %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1 %4, %2, %3
    punpckl%1 %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro

; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro

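; For reference, the mask built by LOAD_MASK corresponds to the usual per-pixel
; H.264 filtering condition; a rough scalar sketch (illustrative C only, not
; FFmpeg's actual implementation; FFABS is the libavutil helper):
;
;   // m7 ends up 0xFF exactly where all three tests pass
;   int filter_mask(int p1, int p0, int q0, int q1, int alpha, int beta)
;   {
;       return FFABS(p0 - q0) < alpha &&
;              FFABS(p1 - p0) < beta  &&
;              FFABS(q1 - q0) < beta;
;   }
;
; Working with alpha-1/beta-1 lets DIFF_GT express "|a-b| >= alpha" as a single
; saturating subtract that leaves a nonzero byte; the final pcmpeqb against
; zero then inverts the OR of the three tests into the mask.
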
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro

; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro

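; Reference formulas for the two macros above (illustrative C sketch only;
; av_clip/av_clip_uint8 are the libavutil helpers, and tc is tc0 as adjusted
; by the p2/q2 tests performed in the callers):
;
;   int delta = av_clip((((q0 - p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc);
;   p0 = av_clip_uint8(p0 + delta);                 // DEBLOCK_P0_Q0
;   q0 = av_clip_uint8(q0 - delta);
;
;   p1 = av_clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1,  // LUMA_Q1 (same for q1)
;                p1 - tc0, p1 + tc0);
;
; DEBLOCK_P0_Q0 stays entirely in bytes: the pavgb chain accumulates
; delta + 0xA1 (the pb_A1 bias), so the positive and negative halves of delta
; drop out of two saturating subtracts and are clamped against tc with pminub.
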
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10, pix_, stride_, alpha_, beta_, base3_
    movd    m8, [r4]           ; tc0
    lea     r4, [stride_q*3]
    dec     alpha_d            ; alpha-1
    neg     r4
    dec     beta_d             ; beta-1
    add     base3_q, pix_q     ; pix-3*stride

    mova    m0, [base3_q + stride_q]   ; p1
    mova    m1, [base3_q + 2*stride_q] ; p0
    mova    m2, [pix_q]                ; q0
    mova    m3, [pix_q + stride_q]     ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [base3_q] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [base3_q], [base3_q + stride_q], m6, m4

    movdqa  m4, [pix_q + 2*stride_q] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [pix_q + stride_q]
    LUMA_Q1 m3, m4, [pix_q + 2*stride_q], [pix_q + stride_q], m8, m6

    DEBLOCK_P0_Q0
    mova    [base3_q + 2*stride_q], m1
    mova    [pix_q], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7, r1d
    lea    r8, [r7+r7*2]
    lea    r6, [r0-4]
    lea    r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    shl    r7, 3
    sub    r6, r7
    sub    r5, r7
    shr    r7, 3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

%macro DEBLOCK_H_LUMA_MBAFF 0

cglobal deblock_h_luma_mbaff_8, 5, 9, 10, 8*16, pix_, stride_, alpha_, beta_, tc0_, base3_, stride3_
    movsxd stride_q, stride_d
    dec    alpha_d
    dec    beta_d
    mov    base3_q, pix_q
    lea    stride3_q, [3*stride_q]
    add    base3_q, stride3_q

    movq   m0, [pix_q - 4]
    movq   m1, [pix_q + stride_q - 4]
    movq   m2, [pix_q + 2*stride_q - 4]
    movq   m3, [base3_q - 4]
    movq   m4, [base3_q + stride_q - 4]
    movq   m5, [base3_q + 2*stride_q - 4]
    movq   m6, [base3_q + stride3_q - 4]
    movq   m7, [base3_q + 4*stride_q - 4]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7

    %assign i 0
    %rep 8
        movq [rsp + 16*i], m %+ i
        %assign i i+1
    %endrep

    ; p2 = m1 [rsp + 16]
    ; p1 = m2 [rsp + 32]
    ; p0 = m3 [rsp + 48]
    ; q0 = m4 [rsp + 64]
    ; q1 = m5 [rsp + 80]
    ; q2 = m6 [rsp + 96]

    SWAP 0, 2
    SWAP 1, 3
    SWAP 2, 4
    SWAP 3, 5

    LOAD_MASK alpha_d, beta_d
    movd      m8, [tc0_q]
    punpcklbw m8, m8
    pcmpeqb   m9, m9
    pcmpeqb   m9, m8
    pandn     m9, m7
    pand      m8, m9

    movdqa    m3, [rsp + 16] ; p2
    DIFF_GT2  m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand      m6, m9
    psubb     m7, m8, m6
    pand      m6, m8
    LUMA_Q1   m0, m3, [rsp + 16], [rsp + 32], m6, m4

    movdqa    m4, [rsp + 96] ; q2
    DIFF_GT2  m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand      m6, m9
    pand      m8, m6
    psubb     m7, m6
    mova      m3, [rsp + 80]
    LUMA_Q1   m3, m4, [rsp + 96], [rsp + 80], m8, m6

    DEBLOCK_P0_Q0
    SWAP 1, 3
    SWAP 2, 4
    movq   m0, [rsp]
    movq   m1, [rsp + 16]
    movq   m2, [rsp + 32]
    movq   m5, [rsp + 80]
    movq   m6, [rsp + 96]
    movq   m7, [rsp + 112]

    TRANSPOSE_8X8B 0,1,2,3,4,5,6,7
    movq   [pix_q - 4], m0
    movq   [pix_q + stride_q - 4], m1
    movq   [pix_q + 2*stride_q - 4], m2
    movq   [base3_q - 4], m3
    movq   [base3_q + stride_q - 4], m4
    movq   [base3_q + 2*stride_q - 4], m5
    movq   [base3_q + stride3_q - 4], m6
    movq   [base3_q + 4*stride_q - 4], m7

    RET

%endmacro

INIT_XMM sse2
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_H_LUMA_MBAFF
DEBLOCK_LUMA
%endif

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova    [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova    [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, [esp]    ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH


%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4

    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro

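; The strong (intra) filter computed by LUMA_INTRA_P012 is only applied where
; the extra conditions hold; the masks prepared in DEBLOCK_LUMA_INTRA below
; implement the usual selection (informal summary, see the H.264 spec for the
; normative form):
;
;   mask0  = |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta
;   mask1p = mask0 && |p0-q0| < (alpha >> 2) + 2 && |p2-p0| < beta
;   mask1q = mask0 && |p0-q0| < (alpha >> 2) + 2 && |q2-q0| < beta
;
; Where mask1p holds, p0..p2 take the 4/5-tap values computed above; where only
; mask0 holds, p0 falls back to the 3-tap p0'b = (2*p1+p0+q1+2)/4.
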
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12          ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1      ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1]     ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7, r1d
    lea    r8, [r7*3]
    lea    r6, [r0-4]
    lea    r5, [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0, [pix_tmp+0x40]
    mov    r1, 0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7, 3
    sub    r6, r7
    sub    r5, r7
    shr    r7, 3
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3, [r1*3]
    sub    r0, 4
    lea    r2, [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0, [r0+r1*8]
    lea    r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0, [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1, r1m
    mov    r0, r0mp
    lea    r3, [r1*3]
    sub    r0, 4
    lea    r2, [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0, [r0+r1*8]
    lea    r2, [r2+r1*8]
    TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

%macro CHROMA_V_START 0
    dec    r2d ; alpha-1
    dec    r3d ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq   m0, [t5]
    movq   m1, [t5+r1]
    movq   m2, [r0]
    movq   m3, [r0+r1]
    call   ff_chroma_inter_body_mmxext
    movq   [t5+r1], m1
    movq   [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub    rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq   buf0, m0
    movq   buf1, m3
    LOAD_MASK r2d, r3d
    movd   m6, [r4] ; tc0
    punpcklbw m6, m6
    pand   m7, m6
    DEBLOCK_P0_Q0
    movq   m0, buf0
    movq   m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add    rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK r2d, r3d
    movd   m6, [r4] ; tc0
    punpcklbw m6, m6
    pand   m7, m6
    DEBLOCK_P0_Q0
    ret

%define t5 r4
%define t6 r5

cglobal deblock_h_chroma422_8, 5, 6
    SUB    rsp, (1+ARCH_X86_64*2)*mmsize
    %if ARCH_X86_64
        %define buf0 [rsp+16]
        %define buf1 [rsp+8]
    %else
        %define buf0 r0m
        %define buf1 r2m
    %endif

    movd   m6, [r4]
    punpcklbw m6, m6
    movq   [rsp], m6
    CHROMA_H_START

    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
    movq   buf0, m0
    movq   buf1, m3
    LOAD_MASK r2d, r3d
    movd   m6, [rsp]
    punpcklwd m6, m6
    pand   m7, m6
    DEBLOCK_P0_Q0
    movq   m0, buf0
    movq   m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)

    lea    r0, [r0+r1*8]
    lea    t5, [t5+r1*8]

    TRANSPOSE4x8B_LOAD PASS8ROWS(t5, r0, r1, t6)
    movq   buf0, m0
    movq   buf1, m3
    LOAD_MASK r2d, r3d
    movd   m6, [rsp+4]
    punpcklwd m6, m6
    pand   m7, m6
    DEBLOCK_P0_Q0
    movq   m0, buf0
    movq   m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)

    ADD    rsp, (1+ARCH_X86_64*2)*mmsize
    RET

; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2     ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq   m0, [t5]
    movq   m1, [t5+r1]
    movq   m2, [r0]
    movq   m3, [r0+r1]
    call   ff_chroma_intra_body_mmxext
    movq   [t5+r1], m1
    movq   [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call   ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

cglobal deblock_h_chroma422_intra_8, 4, 6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call   ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)

    lea    r0, [r0+r1*8]
    lea    t5, [t5+r1*8]

    TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call   ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0 m1, m0, m3
    CHROMA_INTRA_P0 m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret

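; ff_chroma_intra_body_mmxext applies the standard chroma intra filter to both
; sides of the edge; per pixel this amounts to (informal reference; the pb_1
; term in CHROMA_INTRA_P0 just compensates pavgb's rounding):
;
;   p0 = (2*p1 + p0 + q1 + 2) >> 2;
;   q0 = (2*q1 + q0 + p1 + 2) >> 2;
;
; and the result is kept only where the LOAD_MASK condition holds.
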
%macro LOAD_8_ROWS 8
    movd   m0, %1
    movd   m1, %2
    movd   m2, %3
    movd   m3, %4
    movd   m4, %5
    movd   m5, %6
    movd   m6, %7
    movd   m7, %8
%endmacro

%macro STORE_8_ROWS 8
    movd   %1, m0
    movd   %2, m1
    movd   %3, m2
    movd   %4, m3
    movd   %5, m4
    movd   %6, m5
    movd   %7, m6
    movd   %8, m7
%endmacro

%macro TRANSPOSE_8x4B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpcklbw m4, m5
    punpcklbw m6, m7
    punpcklwd m0, m2
    punpcklwd m4, m6
    punpckhdq m2, m0, m4
    punpckldq m0, m4
    MOVHL  m1, m0
    MOVHL  m3, m2
%endmacro

%macro TRANSPOSE_4x8B_XMM 0
    punpcklbw m0, m1
    punpcklbw m2, m3
    punpckhwd m4, m0, m2
    punpcklwd m0, m2
    MOVHL  m6, m4
    MOVHL  m2, m0
    pshufd m1, m0, 1
    pshufd m3, m2, 1
    pshufd m5, m4, 1
    pshufd m7, m6, 1
%endmacro

%macro CHROMA_INTER_BODY_XMM 1
    LOAD_MASK alpha_d, beta_d
    movd   m6, [tc0_q]
    %rep %1
        punpcklbw m6, m6
    %endrep
    pand   m7, m6
    DEBLOCK_P0_Q0
%endmacro

%macro CHROMA_INTRA_BODY_XMM 0
    LOAD_MASK alpha_d, beta_d
    mova    m5, m1
    mova    m6, m2
    pxor    m4, m1, m3
    pand    m4, [pb_1]
    pavgb   m1, m3
    psubusb m1, m4
    pavgb   m1, m0
    pxor    m4, m2, m0
    pand    m4, [pb_1]
    pavgb   m2, m0
    psubusb m2, m4
    pavgb   m2, m3
    psubb   m1, m5
    psubb   m2, m6
    pand    m1, m7
    pand    m2, m7
    paddb   m1, m5
    paddb   m2, m6
%endmacro

%macro CHROMA_V_START_XMM 1
    movsxdifnidn stride_q, stride_d
    dec    alpha_d
    dec    beta_d
    mov    %1, pix_q
    sub    %1, stride_q
    sub    %1, stride_q
%endmacro

%macro CHROMA_H_START_XMM 2
    movsxdifnidn stride_q, stride_d
    dec    alpha_d
    dec    beta_d
    lea    %2, [3*stride_q]
    mov    %1, pix_q
    add    %1, %2
%endmacro

%macro DEBLOCK_CHROMA_XMM 1

INIT_XMM %1

cglobal deblock_v_chroma_8, 5, 6, 8, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_V_START_XMM r5
    movq   m0, [r5]
    movq   m1, [r5 + stride_q]
    movq   m2, [pix_q]
    movq   m3, [pix_q + stride_q]
    CHROMA_INTER_BODY_XMM 1
    movq   [r5 + stride_q], m1
    movq   [pix_q], m2
    RET

cglobal deblock_h_chroma_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq   [rsp], m0
    movq   [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 1
    movq   m0, [rsp]
    movq   m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    RET

cglobal deblock_h_chroma422_8, 5, 7, 8, 0-16, pix_, stride_, alpha_, beta_, tc0_
    CHROMA_H_START_XMM r5, r6
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq   [rsp], m0
    movq   [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq   m0, [rsp]
    movq   m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)

    lea    pix_q, [pix_q + 8*stride_q]
    lea    r5, [r5 + 8*stride_q]
    add    tc0_q, 2

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    TRANSPOSE_8x4B_XMM
    movq   [rsp], m0
    movq   [rsp + 8], m3
    CHROMA_INTER_BODY_XMM 2
    movq   m0, [rsp]
    movq   m3, [rsp + 8]
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r5 - 2, stride_q, r6)
    RET

cglobal deblock_v_chroma_intra_8, 4, 5, 8, pix_, stride_, alpha_, beta_
    CHROMA_V_START_XMM r4
    movq   m0, [r4]
    movq   m1, [r4 + stride_q]
    movq   m2, [pix_q]
    movq   m3, [pix_q + stride_q]
    CHROMA_INTRA_BODY_XMM
    movq   [r4 + stride_q], m1
    movq   [pix_q], m2
    RET

cglobal deblock_h_chroma_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    RET

cglobal deblock_h_chroma422_intra_8, 4, 6, 8, pix_, stride_, alpha_, beta_
    CHROMA_H_START_XMM r4, r5
    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)

    lea    pix_q, [pix_q + 8*stride_q]
    lea    r4, [r4 + 8*stride_q]

    LOAD_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    TRANSPOSE_8x4B_XMM
    CHROMA_INTRA_BODY_XMM
    TRANSPOSE_4x8B_XMM
    STORE_8_ROWS PASS8ROWS(pix_q - 2, r4 - 2, stride_q, r5)
    RET

%endmacro ; DEBLOCK_CHROMA_XMM

DEBLOCK_CHROMA_XMM sse2
DEBLOCK_CHROMA_XMM avx

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir, int edges, int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
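; Rough per-position scalar equivalent of the strength computation below
; (illustrative only; mvy_limit is 4 for frame and 2 for field macroblocks,
; which is what pb_3 / pb_3_1 encode as limit-1):
;
;   if (nnz[b] || nnz[bn])
;       bs = 2;
;   else if (ref[b] != ref[bn] ||
;            FFABS(mv[b][0] - mv[bn][0]) >= 4 ||
;            FFABS(mv[b][1] - mv[bn][1]) >= mvy_limit)
;       bs = 1;
;   else
;       bs = 0;
;
; In the bidir case the ref/mv test is evaluated for both pairings of the two
; reference lists and the smaller result is kept (the trailing pshufw/pminub).
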
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd   %1
%define stepd    %2
%define mask_mvd %3
%define dir      %4
%define d_idx    %5
%define mask_dir %6
%define bidir    %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor         m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd         m2, [refq+b_idxq+d_idx+12]     ; { ref0[bn] }
    punpckldq    m2, [refq+b_idxq+d_idx+52]     ; { ref0[bn], ref1[bn] }
    pshufw       m0, [refq+b_idxq+12], 0x44     ; { ref0[b], ref0[b] }
    pshufw       m1, [refq+b_idxq+52], 0x44     ; { ref1[b], ref1[b] }
    pshufw       m3, m2, 0x4E                   ; { ref1[bn], ref0[bn] }
    psubb        m0, m2                         ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb        m1, m3                         ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    por          m0, m1
    mova         m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova         m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova         m3, m1
    mova         m4, m2
    psubw        m1, [mvq+b_idxq*4+12*4]
    psubw        m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw        m3, [mvq+b_idxq*4+52*4]
    psubw        m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb     m1, m2
    packsswb     m3, m4
    paddb        m1, m6
    paddb        m3, m6
    psubusb      m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb      m3, m5
    packsswb     m1, m3

    por          m0, m1
    mova         m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova         m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova         m3, m1
    mova         m4, m2
    psubw        m1, [mvq+b_idxq*4+12*4]
    psubw        m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw        m3, [mvq+b_idxq*4+52*4]
    psubw        m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb     m1, m2
    packsswb     m3, m4
    paddb        m1, m6
    paddb        m3, m6
    psubusb      m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb      m3, m5
    packsswb     m1, m3

    pshufw       m1, m1, 0x4E
    por          m0, m1
    pshufw       m1, m0, 0x4E
    pminub       m0, m1
%else ; bidir == 0
    movd         m0, [refq+b_idxq+12]
    psubb        m0, [refq+b_idxq+d_idx+12]     ; ref[b] != ref[bn]

    mova         m1, [mvq+b_idxq*4+12*4]
    mova         m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw        m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw        m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb     m1, m2
    paddb        m1, m6
    psubusb      m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb     m1, m1
    por          m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd         m1, [nnzq+b_idxq+12]
    por          m1, [nnzq+b_idxq+d_idx+12]     ; nnz[b] || nnz[bn]

    pminub       m1, m7
    pminub       m0, m7
    psllw        m1, 1
    pxor         m2, m2
    pmaxub       m1, m0
    punpcklbw    m1, m2
    movq         [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro

INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp   dword fieldm, 0
    mova  m7, [pb_1]
    mova  m5, [pb_3]
    je .nofield
    mova  m5, [pb_3_1]
.nofield:
    mova  m6, m5
    paddb m5, m5

    shl   dword stepd, 3
    shl   dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl   dword mask_mv1d, 3
    shl   dword mask_mv0d, 3

    cmp   dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova  m0, [bsq+mmsize*0]
    mova  m1, [bsq+mmsize*1]
    mova  m2, [bsq+mmsize*2]
    mova  m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova  m0, [bsq+mmsize*0]
    mova  m1, [bsq+mmsize*1]
    mova  m2, [bsq+mmsize*2]
    mova  m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET