; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
sgr_r_ext:     times 16 db 1
               times 16 db 9

; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of
; cache but eliminates some shifts in the inner sgr loop which is overall a win
const sgr_x_by_x_avx2
    dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
    dd  15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  9,  9,  8,  8
    dd   8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5
    dd   5,  5,  5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4
    dd   4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3
    dd   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3
    dd   3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
    dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
    dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
    dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
    dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1
    dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
    dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
    dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
    dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
    dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0

               times 4 db -1 ; needed for 16-bit sgr
pb_m5:         times 4 db -5
pb_3:          times 4 db 3
pw_5_6:        dw 5, 6

sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
               db  9, -1, 10, -1, 11, -1, 12, -1

pw_256:        times 2 dw 256
pw_2056:       times 2 dw 2056
pw_m16380:     times 2 dw -16380
pd_25:         dd 25
pd_34816:      dd 34816
pd_m4096:      dd -4096
pd_0xf00801c7: dd 0xf00801c7
pd_0xf00800a4: dd 0xf00800a4

cextern pb_0to63

SECTION .text

DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring
buffer pointers 84 85INIT_YMM avx2 86cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \ 87 w, h, edge, flt 88 mov fltq, r6mp 89 movifnidn hd, hm 90 mov edged, r7m 91 mov wd, wm 92 vbroadcasti128 m6, [wiener_shufA] 93 vpbroadcastb m11, [fltq+ 0] ; x0 x0 94 vbroadcasti128 m7, [wiener_shufB] 95 vpbroadcastd m12, [fltq+ 2] 96 vbroadcasti128 m8, [wiener_shufC] 97 packsswb m12, m12 ; x1 x2 98 vpbroadcastw m13, [fltq+ 6] ; x3 99 vbroadcasti128 m9, [sgr_shuf+6] 100 add lpfq, wq 101 vpbroadcastd m10, [pw_m16380] 102 vpbroadcastd m14, [fltq+16] ; y0 y1 103 add dstq, wq 104 vpbroadcastd m15, [fltq+20] ; y2 y3 105 lea t1, [rsp+wq*2+16] 106 psllw m14, 5 107 neg wq 108 psllw m15, 5 109 test edgeb, 4 ; LR_HAVE_TOP 110 jz .no_top 111 call .h_top 112 add lpfq, strideq 113 mov t6, t1 114 mov t5, t1 115 add t1, 384*2 116 call .h_top 117 lea r10, [lpfq+strideq*4] 118 mov lpfq, dstq 119 mov t4, t1 120 add t1, 384*2 121 add r10, strideq 122 mov [rsp], r10 ; below 123 call .h 124 mov t3, t1 125 mov t2, t1 126 dec hd 127 jz .v1 128 add lpfq, strideq 129 add t1, 384*2 130 call .h 131 mov t2, t1 132 dec hd 133 jz .v2 134 add lpfq, strideq 135 add t1, 384*2 136 call .h 137 dec hd 138 jz .v3 139.main: 140 lea t0, [t1+384*2] 141.main_loop: 142 call .hv 143 dec hd 144 jnz .main_loop 145 test edgeb, 8 ; LR_HAVE_BOTTOM 146 jz .v3 147 mov lpfq, [rsp] 148 call .hv_bottom 149 add lpfq, strideq 150 call .hv_bottom 151.v1: 152 call .v 153 RET 154.no_top: 155 lea r10, [lpfq+strideq*4] 156 mov lpfq, dstq 157 lea r10, [r10+strideq*2] 158 mov [rsp], r10 159 call .h 160 mov t6, t1 161 mov t5, t1 162 mov t4, t1 163 mov t3, t1 164 mov t2, t1 165 dec hd 166 jz .v1 167 add lpfq, strideq 168 add t1, 384*2 169 call .h 170 mov t2, t1 171 dec hd 172 jz .v2 173 add lpfq, strideq 174 add t1, 384*2 175 call .h 176 dec hd 177 jz .v3 178 lea t0, [t1+384*2] 179 call .hv 180 dec hd 181 jz .v3 182 add t0, 384*8 183 call .hv 184 dec hd 185 jnz .main 186.v3: 187 call .v 188.v2: 189 call .v 190 jmp .v1 191.extend_right: 192 movd xm2, r10d 193 vpbroadcastd m0, [pb_3] 194 vpbroadcastd m1, [pb_m5] 195 vpbroadcastb m2, xm2 196 mova m3, [pb_0to63] 197 psubb m0, m2 198 psubb m1, m2 199 pminub m0, m3 200 pminub m1, m3 201 pshufb m4, m0 202 pshufb m5, m1 203 ret 204.h: 205 mov r10, wq 206 test edgeb, 1 ; LR_HAVE_LEFT 207 jz .h_extend_left 208 movd xm4, [leftq] 209 vpblendd m4, [lpfq+r10-4], 0xfe 210 add leftq, 4 211 jmp .h_main 212.h_extend_left: 213 vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located 214 mova m4, [lpfq+r10] ; before the start of the buffer 215 palignr m4, m5, 12 216 pshufb m4, [wiener_l_shuf] 217 jmp .h_main 218.h_top: 219 mov r10, wq 220 test edgeb, 1 ; LR_HAVE_LEFT 221 jz .h_extend_left 222.h_loop: 223 movu m4, [lpfq+r10-4] 224.h_main: 225 movu m5, [lpfq+r10+4] 226 test edgeb, 2 ; LR_HAVE_RIGHT 227 jnz .h_have_right 228 cmp r10d, -34 229 jl .h_have_right 230 call .extend_right 231.h_have_right: 232 pshufb m0, m4, m6 233 pmaddubsw m0, m11 234 pshufb m1, m5, m6 235 pmaddubsw m1, m11 236 pshufb m2, m4, m7 237 pmaddubsw m2, m12 238 pshufb m3, m5, m7 239 pmaddubsw m3, m12 240 paddw m0, m2 241 pshufb m2, m4, m8 242 pmaddubsw m2, m12 243 paddw m1, m3 244 pshufb m3, m5, m8 245 pmaddubsw m3, m12 246 pshufb m4, m9 247 paddw m0, m2 248 pmullw m2, m4, m13 249 pshufb m5, m9 250 paddw m1, m3 251 pmullw m3, m5, m13 252 psllw m4, 7 253 psllw m5, 7 254 paddw m4, m10 255 paddw m5, m10 256 paddw m0, m2 257 vpbroadcastd m2, [pw_2056] 258 paddw m1, m3 259 paddsw m0, m4 260 paddsw m1, m5 261 psraw m0, 3 262 psraw m1, 3 
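; The horizontal pass does not produce pixels; it stores biased 16-bit
; intermediates into the current ring-buffer row (384*2 bytes each, t0-t6)
; for the vertical pass to blend. As a rough scalar sketch (illustrative
; only; the names are invented for this comment, not the reference code):
;   hor[x] = ((fh[0..6] dot src[x-3..x+3] + 128*src[x] - 16380) >> 3) + 2056
; The extra 128*src[x] term (the psllw by 7) stands in for the part of the
; center tap that does not fit the signed-byte pmaddubsw coefficients, and
; the -16380/+2056 pair re-biases the sum so the saturated intermediate
; stays within signed 16-bit range.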
263 paddw m0, m2 264 paddw m1, m2 265 mova [t1+r10*2+ 0], m0 266 mova [t1+r10*2+32], m1 267 add r10, 32 268 jl .h_loop 269 ret 270ALIGN function_align 271.hv: 272 add lpfq, strideq 273 mov r10, wq 274 test edgeb, 1 ; LR_HAVE_LEFT 275 jz .hv_extend_left 276 movd xm4, [leftq] 277 vpblendd m4, [lpfq+r10-4], 0xfe 278 add leftq, 4 279 jmp .hv_main 280.hv_extend_left: 281 movu m4, [lpfq+r10-4] 282 pshufb m4, [wiener_l_shuf] 283 jmp .hv_main 284.hv_bottom: 285 mov r10, wq 286 test edgeb, 1 ; LR_HAVE_LEFT 287 jz .hv_extend_left 288.hv_loop: 289 movu m4, [lpfq+r10-4] 290.hv_main: 291 movu m5, [lpfq+r10+4] 292 test edgeb, 2 ; LR_HAVE_RIGHT 293 jnz .hv_have_right 294 cmp r10d, -34 295 jl .hv_have_right 296 call .extend_right 297.hv_have_right: 298 pshufb m0, m4, m6 299 pmaddubsw m0, m11 300 pshufb m1, m5, m6 301 pmaddubsw m1, m11 302 pshufb m2, m4, m7 303 pmaddubsw m2, m12 304 pshufb m3, m5, m7 305 pmaddubsw m3, m12 306 paddw m0, m2 307 pshufb m2, m4, m8 308 pmaddubsw m2, m12 309 paddw m1, m3 310 pshufb m3, m5, m8 311 pmaddubsw m3, m12 312 pshufb m4, m9 313 paddw m0, m2 314 pmullw m2, m4, m13 315 pshufb m5, m9 316 paddw m1, m3 317 pmullw m3, m5, m13 318 psllw m4, 7 319 psllw m5, 7 320 paddw m4, m10 321 paddw m5, m10 322 paddw m0, m2 323 paddw m1, m3 324 mova m2, [t4+r10*2] 325 paddw m2, [t2+r10*2] 326 mova m3, [t3+r10*2] 327 paddsw m0, m4 328 vpbroadcastd m4, [pw_2056] 329 paddsw m1, m5 330 mova m5, [t5+r10*2] 331 paddw m5, [t1+r10*2] 332 psraw m0, 3 333 psraw m1, 3 334 paddw m0, m4 335 paddw m1, m4 336 paddw m4, m0, [t6+r10*2] 337 mova [t0+r10*2], m0 338 punpcklwd m0, m2, m3 339 pmaddwd m0, m15 340 punpckhwd m2, m3 341 pmaddwd m2, m15 342 punpcklwd m3, m4, m5 343 pmaddwd m3, m14 344 punpckhwd m4, m5 345 pmaddwd m4, m14 346 paddd m0, m3 347 paddd m4, m2 348 mova m2, [t4+r10*2+32] 349 paddw m2, [t2+r10*2+32] 350 mova m3, [t3+r10*2+32] 351 mova m5, [t5+r10*2+32] 352 paddw m5, [t1+r10*2+32] 353 packuswb m0, m4 354 paddw m4, m1, [t6+r10*2+32] 355 mova [t0+r10*2+32], m1 356 punpcklwd m1, m2, m3 357 pmaddwd m1, m15 358 punpckhwd m2, m3 359 pmaddwd m2, m15 360 punpcklwd m3, m4, m5 361 pmaddwd m3, m14 362 punpckhwd m4, m5 363 pmaddwd m4, m14 364 paddd m1, m3 365 paddd m2, m4 366 packuswb m1, m2 367 psrlw m0, 8 368 psrlw m1, 8 369 packuswb m0, m1 370 mova [dstq+r10], m0 371 add r10, 32 372 jl .hv_loop 373 mov t6, t5 374 mov t5, t4 375 mov t4, t3 376 mov t3, t2 377 mov t2, t1 378 mov t1, t0 379 mov t0, t6 380 add dstq, strideq 381 ret 382.v: 383 mov r10, wq 384.v_loop: 385 mova m2, [t4+r10*2+ 0] 386 paddw m2, [t2+r10*2+ 0] 387 mova m4, [t3+r10*2+ 0] 388 mova m6, [t1+r10*2+ 0] 389 paddw m8, m6, [t6+r10*2+ 0] 390 paddw m6, [t5+r10*2+ 0] 391 mova m3, [t4+r10*2+32] 392 paddw m3, [t2+r10*2+32] 393 mova m5, [t3+r10*2+32] 394 mova m7, [t1+r10*2+32] 395 paddw m9, m7, [t6+r10*2+32] 396 paddw m7, [t5+r10*2+32] 397 punpcklwd m0, m2, m4 398 pmaddwd m0, m15 399 punpckhwd m2, m4 400 pmaddwd m2, m15 401 punpcklwd m4, m8, m6 402 pmaddwd m4, m14 403 punpckhwd m6, m8, m6 404 pmaddwd m6, m14 405 punpcklwd m1, m3, m5 406 pmaddwd m1, m15 407 punpckhwd m3, m5 408 pmaddwd m3, m15 409 punpcklwd m5, m9, m7 410 pmaddwd m5, m14 411 punpckhwd m7, m9, m7 412 pmaddwd m7, m14 413 paddd m0, m4 414 paddd m2, m6 415 paddd m1, m5 416 paddd m3, m7 417 packuswb m0, m2 418 packuswb m1, m3 419 psrlw m0, 8 420 psrlw m1, 8 421 packuswb m0, m1 422 mova [dstq+r10], m0 423 add r10, 32 424 jl .v_loop 425 mov t6, t5 426 mov t5, t4 427 mov t4, t3 428 mov t3, t2 429 mov t2, t1 430 add dstq, strideq 431 ret 432 433cglobal wiener_filter5_8bpc, 4, 13, 16, 
384*8+16, dst, stride, left, lpf, \ 434 w, h, edge, flt 435 mov fltq, r6mp 436 movifnidn hd, hm 437 mov edged, r7m 438 mov wd, wm 439 vbroadcasti128 m6, [wiener_shufB] 440 vpbroadcastd m12, [fltq+ 2] 441 vbroadcasti128 m7, [wiener_shufC] 442 packsswb m12, m12 ; x1 x2 443 vpbroadcastw m13, [fltq+ 6] ; x3 444 vbroadcasti128 m8, [sgr_shuf+6] 445 add lpfq, wq 446 vpbroadcastd m9, [pw_m16380] 447 vpbroadcastd m10, [pw_2056] 448 mova m11, [wiener_l_shuf] 449 vpbroadcastd m14, [fltq+16] ; __ y1 450 add dstq, wq 451 vpbroadcastd m15, [fltq+20] ; y2 y3 452 lea t1, [rsp+wq*2+16] 453 psllw m14, 5 454 neg wq 455 psllw m15, 5 456 test edgeb, 4 ; LR_HAVE_TOP 457 jz .no_top 458 call .h_top 459 add lpfq, strideq 460 mov t4, t1 461 add t1, 384*2 462 call .h_top 463 lea r10, [lpfq+strideq*4] 464 mov lpfq, dstq 465 mov t3, t1 466 add t1, 384*2 467 add r10, strideq 468 mov [rsp], r10 ; below 469 call .h 470 mov t2, t1 471 dec hd 472 jz .v1 473 add lpfq, strideq 474 add t1, 384*2 475 call .h 476 dec hd 477 jz .v2 478.main: 479 mov t0, t4 480.main_loop: 481 call .hv 482 dec hd 483 jnz .main_loop 484 test edgeb, 8 ; LR_HAVE_BOTTOM 485 jz .v2 486 mov lpfq, [rsp] 487 call .hv_bottom 488 add lpfq, strideq 489 call .hv_bottom 490.end: 491 RET 492.no_top: 493 lea r10, [lpfq+strideq*4] 494 mov lpfq, dstq 495 lea r10, [r10+strideq*2] 496 mov [rsp], r10 497 call .h 498 mov t4, t1 499 mov t3, t1 500 mov t2, t1 501 dec hd 502 jz .v1 503 add lpfq, strideq 504 add t1, 384*2 505 call .h 506 dec hd 507 jz .v2 508 lea t0, [t1+384*2] 509 call .hv 510 dec hd 511 jz .v2 512 add t0, 384*6 513 call .hv 514 dec hd 515 jnz .main 516.v2: 517 call .v 518 mov t4, t3 519 mov t3, t2 520 mov t2, t1 521 add dstq, strideq 522.v1: 523 call .v 524 jmp .end 525.h: 526 mov r10, wq 527 test edgeb, 1 ; LR_HAVE_LEFT 528 jz .h_extend_left 529 movd xm4, [leftq] 530 vpblendd m4, [lpfq+r10-4], 0xfe 531 add leftq, 4 532 jmp .h_main 533.h_extend_left: 534 vbroadcasti128 m5, [lpfq+r10] ; avoid accessing memory located 535 mova m4, [lpfq+r10] ; before the start of the buffer 536 palignr m4, m5, 12 537 pshufb m4, m11 538 jmp .h_main 539.h_top: 540 mov r10, wq 541 test edgeb, 1 ; LR_HAVE_LEFT 542 jz .h_extend_left 543.h_loop: 544 movu m4, [lpfq+r10-4] 545.h_main: 546 movu m5, [lpfq+r10+4] 547 test edgeb, 2 ; LR_HAVE_RIGHT 548 jnz .h_have_right 549 cmp r10d, -33 550 jl .h_have_right 551 call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right 552.h_have_right: 553 pshufb m0, m4, m6 554 pmaddubsw m0, m12 555 pshufb m1, m5, m6 556 pmaddubsw m1, m12 557 pshufb m2, m4, m7 558 pmaddubsw m2, m12 559 pshufb m3, m5, m7 560 pmaddubsw m3, m12 561 pshufb m4, m8 562 paddw m0, m2 563 pmullw m2, m4, m13 564 pshufb m5, m8 565 paddw m1, m3 566 pmullw m3, m5, m13 567 psllw m4, 7 568 psllw m5, 7 569 paddw m4, m9 570 paddw m5, m9 571 paddw m0, m2 572 paddw m1, m3 573 paddsw m0, m4 574 paddsw m1, m5 575 psraw m0, 3 576 psraw m1, 3 577 paddw m0, m10 578 paddw m1, m10 579 mova [t1+r10*2+ 0], m0 580 mova [t1+r10*2+32], m1 581 add r10, 32 582 jl .h_loop 583 ret 584ALIGN function_align 585.hv: 586 add lpfq, strideq 587 mov r10, wq 588 test edgeb, 1 ; LR_HAVE_LEFT 589 jz .hv_extend_left 590 movd xm4, [leftq] 591 vpblendd m4, [lpfq+r10-4], 0xfe 592 add leftq, 4 593 jmp .hv_main 594.hv_extend_left: 595 movu m4, [lpfq+r10-4] 596 pshufb m4, m11 597 jmp .hv_main 598.hv_bottom: 599 mov r10, wq 600 test edgeb, 1 ; LR_HAVE_LEFT 601 jz .hv_extend_left 602.hv_loop: 603 movu m4, [lpfq+r10-4] 604.hv_main: 605 movu m5, [lpfq+r10+4] 606 test edgeb, 2 ; LR_HAVE_RIGHT 607 jnz 
.hv_have_right 608 cmp r10d, -33 609 jl .hv_have_right 610 call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right 611.hv_have_right: 612 pshufb m0, m4, m6 613 pmaddubsw m0, m12 614 pshufb m1, m5, m6 615 pmaddubsw m1, m12 616 pshufb m2, m4, m7 617 pmaddubsw m2, m12 618 pshufb m3, m5, m7 619 pmaddubsw m3, m12 620 pshufb m4, m8 621 paddw m0, m2 622 pmullw m2, m4, m13 623 pshufb m5, m8 624 paddw m1, m3 625 pmullw m3, m5, m13 626 psllw m4, 7 627 psllw m5, 7 628 paddw m4, m9 629 paddw m5, m9 630 paddw m0, m2 631 paddw m1, m3 632 mova m2, [t3+r10*2] 633 paddw m2, [t1+r10*2] 634 mova m3, [t2+r10*2] 635 paddsw m0, m4 636 paddsw m1, m5 637 psraw m0, 3 638 psraw m1, 3 639 paddw m0, m10 640 paddw m1, m10 641 paddw m4, m0, [t4+r10*2] 642 mova [t0+r10*2], m0 643 punpcklwd m0, m2, m3 644 pmaddwd m0, m15 645 punpckhwd m2, m3 646 pmaddwd m2, m15 647 punpcklwd m3, m4, m4 648 pmaddwd m3, m14 649 punpckhwd m4, m4 650 pmaddwd m4, m14 651 paddd m0, m3 652 paddd m4, m2 653 mova m2, [t3+r10*2+32] 654 paddw m2, [t1+r10*2+32] 655 mova m3, [t2+r10*2+32] 656 packuswb m0, m4 657 paddw m4, m1, [t4+r10*2+32] 658 mova [t0+r10*2+32], m1 659 punpcklwd m1, m2, m3 660 pmaddwd m1, m15 661 punpckhwd m2, m3 662 pmaddwd m2, m15 663 punpcklwd m3, m4, m4 664 pmaddwd m3, m14 665 punpckhwd m4, m4 666 pmaddwd m4, m14 667 paddd m1, m3 668 paddd m2, m4 669 packuswb m1, m2 670 psrlw m0, 8 671 psrlw m1, 8 672 packuswb m0, m1 673 mova [dstq+r10], m0 674 add r10, 32 675 jl .hv_loop 676 mov t4, t3 677 mov t3, t2 678 mov t2, t1 679 mov t1, t0 680 mov t0, t4 681 add dstq, strideq 682 ret 683.v: 684 mov r10, wq 685 psrld m13, m14, 16 ; y1 __ 686.v_loop: 687 mova m6, [t1+r10*2+ 0] 688 paddw m2, m6, [t3+r10*2+ 0] 689 mova m4, [t2+r10*2+ 0] 690 mova m7, [t1+r10*2+32] 691 paddw m3, m7, [t3+r10*2+32] 692 mova m5, [t2+r10*2+32] 693 paddw m6, [t4+r10*2+ 0] 694 paddw m7, [t4+r10*2+32] 695 punpcklwd m0, m2, m4 696 pmaddwd m0, m15 697 punpckhwd m2, m4 698 pmaddwd m2, m15 699 punpcklwd m1, m3, m5 700 pmaddwd m1, m15 701 punpckhwd m3, m5 702 pmaddwd m3, m15 703 punpcklwd m5, m7, m6 704 pmaddwd m4, m5, m14 705 punpckhwd m7, m6 706 pmaddwd m6, m7, m14 707 pmaddwd m5, m13 708 pmaddwd m7, m13 709 paddd m0, m4 710 paddd m2, m6 711 paddd m1, m5 712 paddd m3, m7 713 packuswb m0, m2 714 packuswb m1, m3 715 psrlw m0, 8 716 psrlw m1, 8 717 packuswb m0, m1 718 mova [dstq+r10], m0 719 add r10, 32 720 jl .v_loop 721 ret 722 723cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \ 724 w, h, edge, params 725%define base r12-sgr_x_by_x_avx2-256*4 726 lea r12, [sgr_x_by_x_avx2+256*4] 727 mov paramsq, r6mp 728 mov wd, wm 729 movifnidn hd, hm 730 mov edged, r7m 731 vbroadcasti128 m8, [base+sgr_shuf+0] 732 vbroadcasti128 m9, [base+sgr_shuf+8] 733 add lpfq, wq 734 vbroadcasti128 m10, [base+sgr_shuf+2] 735 add dstq, wq 736 vbroadcasti128 m11, [base+sgr_shuf+6] 737 lea t3, [rsp+wq*4+16+400*12] 738 vpbroadcastd m12, [paramsq+0] ; s0 739 pxor m6, m6 740 vpbroadcastw m7, [paramsq+8] ; w0 741 lea t1, [rsp+wq*2+20] 742 vpbroadcastd m13, [base+pd_0xf00800a4] 743 neg wq 744 vpbroadcastd m14, [base+pd_34816] ; (1 << 11) + (1 << 15) 745 psllw m7, 4 746 vpbroadcastd m15, [base+pd_m4096] 747 test edgeb, 4 ; LR_HAVE_TOP 748 jz .no_top 749 call .h_top 750 add lpfq, strideq 751 mov t2, t1 752 call .top_fixup 753 add t1, 400*6 754 call .h_top 755 lea r10, [lpfq+strideq*4] 756 mov lpfq, dstq 757 add r10, strideq 758 mov [rsp], r10 ; below 759 mov t0, t2 760 dec hd 761 jz .height1 762 or edged, 16 763 call .h 764.main: 765 add lpfq, strideq 766 call .hv 767 
    call            .prep_n
    sub             hd, 2
    jl              .extend_bottom
.main_loop:
    add             lpfq, strideq
    test            hd, hd
    jz              .odd_height
    call            .h
    add             lpfq, strideq
    call            .hv
    call            .n0
    call            .n1
    sub             hd, 2
    jge             .main_loop
    test            edgeb, 8 ; LR_HAVE_BOTTOM
    jz              .extend_bottom
    mov             lpfq, [rsp]
    call            .h_top
    add             lpfq, strideq
    call            .hv_bottom
.end:
    call            .n0
    call            .n1
.end2:
    RET
.height1:
    call            .hv
    call            .prep_n
    jmp             .odd_height_end
.odd_height:
    call            .hv
    call            .n0
    call            .n1
.odd_height_end:
    call            .v
    call            .n0
    jmp             .end2
.extend_bottom:
    call            .v
    jmp             .end
.no_top:
    lea             r10, [lpfq+strideq*4]
    mov             lpfq, dstq
    lea             r10, [r10+strideq*2]
    mov             [rsp], r10
    call            .h
    lea             t2, [t1+400*6]
    call            .top_fixup
    dec             hd
    jz              .no_top_height1
    or              edged, 16
    mov             t0, t1
    mov             t1, t2
    jmp             .main
.no_top_height1:
    call            .v
    call            .prep_n
    jmp             .odd_height_end
.extend_right:
    movd            xm2, r10d
    mova            m0, [sgr_r_ext]
    vpbroadcastb    m2, xm2
    psubb           m0, m2
    pminub          m0, [pb_0to63]
    pshufb          m5, m0
    ret
.h: ; horizontal boxsum
    lea             r10, [wq-2]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
    vpbroadcastd    xm0, [leftq]
    mova            xm5, [lpfq+wq]
    palignr         xm5, xm0, 12
    add             leftq, 4
    jmp             .h_main
.h_extend_left:
    mova            xm5, [lpfq+wq]
    pshufb          xm5, [base+sgr_l_shuf]
    jmp             .h_main
.h_top:
    lea             r10, [wq-2]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .h_extend_left
.h_loop:
    movu            xm5, [lpfq+r10-2]
.h_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .h_have_right
    cmp             r10d, -18
    jl              .h_have_right
    call            .extend_right
.h_have_right:
    pshufb          m3, m5, m8
    pmullw          m4, m3, m3
    pshufb          m2, m5, m9
    paddw           m0, m3, m2
    shufps          m3, m2, q2121
    paddw           m0, m3
    punpcklwd       m1, m2, m3
    pmaddwd         m1, m1
    punpckhwd       m2, m3
    pmaddwd         m2, m2
    punpcklwd       m3, m4, m6
    paddd           m1, m3
    punpckhwd       m4, m6
    paddd           m2, m4
    pshufb          m4, m5, m10
    paddw           m0, m4
    pshufb          m5, m11
    paddw           m0, m5 ; sum
    punpcklwd       m3, m4, m5
    pmaddwd         m3, m3
    punpckhwd       m4, m5
    pmaddwd         m4, m4
    test            edgeb, 16 ; y > 0
    jz              .h_loop_end
    paddw           m0, [t1+r10*2+400*0]
    paddd           m1, [t1+r10*2+400*2]
    paddd           m2, [t1+r10*2+400*4]
.h_loop_end:
    paddd           m1, m3 ; sumsq
    paddd           m2, m4
    mova            [t1+r10*2+400*0], m0
    mova            [t1+r10*2+400*2], m1
    mova            [t1+r10*2+400*4], m2
    add             r10, 16
    jl              .h_loop
    ret
.top_fixup:
    lea             r10, [wq-2]
.top_fixup_loop: ; the sums of the first row need to be doubled
    mova            m0, [t1+r10*2+400*0]
    mova            m1, [t1+r10*2+400*2]
    mova            m2, [t1+r10*2+400*4]
    paddw           m0, m0
    paddd           m1, m1
    paddd           m2, m2
    mova            [t2+r10*2+400*0], m0
    mova            [t2+r10*2+400*2], m1
    mova            [t2+r10*2+400*4], m2
    add             r10, 16
    jl              .top_fixup_loop
    ret
ALIGN function_align
.hv: ; horizontal boxsum + vertical boxsum + ab
    lea             r10, [wq-2]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
    vpbroadcastd    xm0, [leftq]
    mova            xm5, [lpfq+wq]
    palignr         xm5, xm0, 12
    add             leftq, 4
    jmp             .hv_main
.hv_extend_left:
    mova            xm5, [lpfq+wq]
    pshufb          xm5, [base+sgr_l_shuf]
    jmp             .hv_main
.hv_bottom:
    lea             r10, [wq-2]
    test            edgeb, 1 ; LR_HAVE_LEFT
    jz              .hv_extend_left
.hv_loop:
    movu            xm5, [lpfq+r10-2]
.hv_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test            edgeb, 2 ; LR_HAVE_RIGHT
    jnz             .hv_have_right
    cmp             r10d, -18
    jl              .hv_have_right
    call            .extend_right
.hv_have_right:
    pshufb          m1, m5, m8
    pmullw          m4, m1, m1
    pshufb          m3, m5, m9
    paddw           m0, m1, m3
    shufps          m1, m3, q2121
    paddw           m0, m1
    punpcklwd       m2, m3, m1
    pmaddwd         m2, m2
    punpckhwd       m3, m1
    pmaddwd         m3, m3
    punpcklwd       m1, m4, m6
    paddd           m2, m1
    punpckhwd       m4, m6
    paddd           m3, m4
    pshufb          m1, m5, m10
    paddw           m0, m1
    pshufb          m5, m11
    paddw           m0, m5 ; h sum
    punpcklwd       m4, m5, m1
    pmaddwd         m4, m4
    punpckhwd       m5, m1
    pmaddwd         m5, m5
    paddw           m1, m0, [t1+r10*2+400*0]
    paddd           m2, m4 ; h sumsq
    paddd           m3, m5
    paddd           m4, m2, [t1+r10*2+400*2]
    paddd           m5, m3, [t1+r10*2+400*4]
    test            hd, hd
    jz              .hv_last_row
.hv_main2:
    paddw           m1, [t2+r10*2+400*0] ; hv sum
    paddd           m4, [t2+r10*2+400*2] ; hv sumsq
    paddd           m5, [t2+r10*2+400*4]
    mova            [t0+r10*2+400*0], m0
    mova            [t0+r10*2+400*2], m2
    mova            [t0+r10*2+400*4], m3
    vpbroadcastd    m2, [pd_25]
    punpcklwd       m0, m1, m6 ; b
    punpckhwd       m1, m6
    pmulld          m4, m2 ; a * 25
    pmulld          m5, m2
    pmaddwd         m2, m0, m0 ; b * b
    pmaddwd         m3, m1, m1
    psubd           m4, m2 ; p
    psubd           m5, m3
    pmulld          m4, m12 ; p * s
    pmulld          m5, m12
    pmaddwd         m0, m13 ; b * 164
    pmaddwd         m1, m13
    paddusw         m4, m13
    paddusw         m5, m13
    psrad           m3, m4, 20 ; min(z, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m4 ; x
    psrad           m4, m5, 20
    vpgatherdd      m3, [r12+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m14
    pand            m0, m15
    pand            m1, m15
    por             m0, m2 ; a | (b << 12)
    por             m1, m3
    mova            [t3+r10*4+ 8], xm0    ; The neighbor calculations require
    vextracti128    [t3+r10*4+40], m0, 1  ; 13 bits for a and 21 bits for b.
    mova            [t3+r10*4+24], xm1    ; Packing them allows for 12+20, but
    vextracti128    [t3+r10*4+56], m1, 1  ; that gets us most of the way.
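; Layout of each packed dword written to t3 (a sketch for reference; 'a' and
; 'b' follow the naming in the comments above, they are not extra variables):
;   packed = a | (b << 12)    ; a = gathered x value, low 12 bits
;                             ; b = rounded x * b * 164 term, upper 20 bits
; The neighbor passes below undo this with pandn against the pd_m4096 mask
; for a and psrld by 12 for b. The gather above effectively reads
; sgr_x_by_x_avx2[min(z, 255)]: r12 points 256 dwords past the start of the
; table and the computed index is min(z, 255) - 256.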
1006 add r10, 16 1007 jl .hv_loop 1008 mov t2, t1 1009 mov t1, t0 1010 mov t0, t2 1011 ret 1012.hv_last_row: ; esoteric edge case for odd heights 1013 mova [t1+r10*2+400*0], m1 1014 paddw m1, m0 1015 mova [t1+r10*2+400*2], m4 1016 paddd m4, m2 1017 mova [t1+r10*2+400*4], m5 1018 paddd m5, m3 1019 jmp .hv_main2 1020.v: ; vertical boxsum + ab 1021 lea r10, [wq-2] 1022.v_loop: 1023 mova m0, [t1+r10*2+400*0] 1024 mova m2, [t1+r10*2+400*2] 1025 mova m3, [t1+r10*2+400*4] 1026 paddw m1, m0, [t2+r10*2+400*0] 1027 paddd m4, m2, [t2+r10*2+400*2] 1028 paddd m5, m3, [t2+r10*2+400*4] 1029 paddw m0, m0 1030 paddd m2, m2 1031 paddd m3, m3 1032 paddw m1, m0 ; hv sum 1033 paddd m4, m2 ; hv sumsq 1034 paddd m5, m3 1035 vpbroadcastd m2, [pd_25] 1036 punpcklwd m0, m1, m6 ; b 1037 punpckhwd m1, m6 1038 pmulld m4, m2 ; a * 25 1039 pmulld m5, m2 1040 pmaddwd m2, m0, m0 ; b * b 1041 pmaddwd m3, m1, m1 1042 psubd m4, m2 ; p 1043 psubd m5, m3 1044 pmulld m4, m12 ; p * s 1045 pmulld m5, m12 1046 pmaddwd m0, m13 ; b * 164 1047 pmaddwd m1, m13 1048 paddusw m4, m13 1049 paddusw m5, m13 1050 psrad m3, m4, 20 ; min(z, 255) - 256 1051 vpgatherdd m2, [r12+m3*4], m4 ; x 1052 psrad m4, m5, 20 1053 vpgatherdd m3, [r12+m4*4], m5 1054 pmulld m0, m2 1055 pmulld m1, m3 1056 paddd m0, m14 ; x * b * 164 + (1 << 11) + (1 << 15) 1057 paddd m1, m14 1058 pand m0, m15 1059 pand m1, m15 1060 por m0, m2 ; a | (b << 12) 1061 por m1, m3 1062 mova [t3+r10*4+ 8], xm0 1063 vextracti128 [t3+r10*4+40], m0, 1 1064 mova [t3+r10*4+24], xm1 1065 vextracti128 [t3+r10*4+56], m1, 1 1066 add r10, 16 1067 jl .v_loop 1068 ret 1069.prep_n: ; initial neighbor setup 1070 mov r10, wq 1071.prep_n_loop: 1072 movu m0, [t3+r10*4+ 4] 1073 movu m1, [t3+r10*4+36] 1074 paddd m2, m0, [t3+r10*4+ 0] 1075 paddd m3, m1, [t3+r10*4+32] 1076 paddd m2, [t3+r10*4+ 8] 1077 paddd m3, [t3+r10*4+40] 1078 paddd m0, m2 1079 pslld m2, 2 1080 paddd m1, m3 1081 pslld m3, 2 1082 paddd m2, m0 ; ab 565 1083 paddd m3, m1 1084 pandn m0, m15, m2 ; a 1085 psrld m2, 12 ; b 1086 pandn m1, m15, m3 1087 psrld m3, 12 1088 mova [t3+r10*4+400*4+ 0], m0 1089 mova [t3+r10*4+400*8+ 0], m2 1090 mova [t3+r10*4+400*4+32], m1 1091 mova [t3+r10*4+400*8+32], m3 1092 add r10, 16 1093 jl .prep_n_loop 1094 ret 1095ALIGN function_align 1096.n0: ; neighbor + output (even rows) 1097 mov r10, wq 1098.n0_loop: 1099 movu m0, [t3+r10*4+ 4] 1100 movu m1, [t3+r10*4+36] 1101 paddd m2, m0, [t3+r10*4+ 0] 1102 paddd m3, m1, [t3+r10*4+32] 1103 paddd m2, [t3+r10*4+ 8] 1104 paddd m3, [t3+r10*4+40] 1105 paddd m0, m2 1106 pslld m2, 2 1107 paddd m1, m3 1108 pslld m3, 2 1109 paddd m2, m0 1110 paddd m3, m1 1111 pandn m0, m15, m2 1112 psrld m2, 12 1113 pandn m1, m15, m3 1114 psrld m3, 12 1115 paddd m4, m0, [t3+r10*4+400*4+ 0] ; a 1116 paddd m5, m1, [t3+r10*4+400*4+32] 1117 mova [t3+r10*4+400*4+ 0], m0 1118 mova [t3+r10*4+400*4+32], m1 1119 paddd m0, m2, [t3+r10*4+400*8+ 0] ; b 1120 paddd m1, m3, [t3+r10*4+400*8+32] 1121 mova [t3+r10*4+400*8+ 0], m2 1122 mova [t3+r10*4+400*8+32], m3 1123 pmovzxbd m2, [dstq+r10+0] 1124 pmovzxbd m3, [dstq+r10+8] 1125 pmaddwd m4, m2 ; a * src 1126 pmaddwd m5, m3 1127 packssdw m2, m3 1128 psubd m0, m4 ; b - a * src + (1 << 8) 1129 psubd m1, m5 1130 psrad m0, 9 1131 psrad m1, 9 1132 packssdw m0, m1 1133 pmulhrsw m0, m7 1134 paddw m0, m2 1135 vextracti128 xm1, m0, 1 1136 packuswb xm0, xm1 1137 pshufd xm0, xm0, q3120 1138 mova [dstq+r10], xm0 1139 add r10, 16 1140 jl .n0_loop 1141 add dstq, strideq 1142 ret 1143ALIGN function_align 1144.n1: ; neighbor + output (odd rows) 1145 mov r10, wq 1146.n1_loop: 1147 
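; Odd rows reuse the a/b neighbor sums stored by .n0 for the preceding even
; row (the 5x5 box sums are only advanced every other line), so no new sums
; are accumulated here; with only one 565-weighted row contributing, the
; rounding offset and shift below are one bit smaller than in .n0
; ((1 << 7) and >> 8 instead of (1 << 8) and >> 9).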
pmovzxbd m2, [dstq+r10+0] 1148 pmovzxbd m3, [dstq+r10+8] 1149 pmaddwd m4, m2, [t3+r10*4+400*4+ 0] ; a * src 1150 pmaddwd m5, m3, [t3+r10*4+400*4+32] 1151 mova m0, [t3+r10*4+400*8+ 0] ; b 1152 mova m1, [t3+r10*4+400*8+32] 1153 packssdw m2, m3 1154 psubd m0, m4 ; b - a * src + (1 << 7) 1155 psubd m1, m5 1156 psrad m0, 8 1157 psrad m1, 8 1158 packssdw m0, m1 1159 pmulhrsw m0, m7 1160 paddw m0, m2 1161 vextracti128 xm1, m0, 1 1162 packuswb xm0, xm1 1163 pshufd xm0, xm0, q3120 1164 mova [dstq+r10], xm0 1165 add r10, 16 1166 jl .n1_loop 1167 add dstq, strideq 1168 ret 1169 1170cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \ 1171 w, h, edge, params 1172%define base r14-sgr_x_by_x_avx2-256*4 1173 mov paramsq, r6mp 1174 mov wd, wm 1175 movifnidn hd, hm 1176 mov edged, r7m 1177 lea r14, [sgr_x_by_x_avx2+256*4] 1178 vbroadcasti128 m8, [base+sgr_shuf+2] 1179 add lpfq, wq 1180 vbroadcasti128 m9, [base+sgr_shuf+4] 1181 add dstq, wq 1182 vbroadcasti128 m10, [base+sgr_shuf+6] 1183 lea t3, [rsp+wq*4+16+400*12] 1184 vpbroadcastd m11, [paramsq+ 4] ; s1 1185 pxor m6, m6 1186 vpbroadcastw m7, [paramsq+10] ; w1 1187 lea t1, [rsp+wq*2+20] 1188 vpbroadcastd m12, [base+pd_0xf00801c7] 1189 neg wq 1190 vpbroadcastd m13, [base+pd_34816] ; (1 << 11) + (1 << 15) 1191 psllw m7, 4 1192 vpbroadcastd m14, [base+pd_m4096] 1193 test edgeb, 4 ; LR_HAVE_TOP 1194 jz .no_top 1195 call .h_top 1196 add lpfq, strideq 1197 mov t2, t1 1198 add t1, 400*6 1199 call .h_top 1200 lea t4, [lpfq+strideq*4] 1201 mov lpfq, dstq 1202 add t4, strideq 1203 mov [rsp], t4 ; below 1204 mov t0, t2 1205 call .hv 1206.main: 1207 mov t5, t3 1208 add t3, 400*4 1209 dec hd 1210 jz .height1 1211 add lpfq, strideq 1212 call .hv 1213 call .prep_n 1214 dec hd 1215 jz .extend_bottom 1216.main_loop: 1217 add lpfq, strideq 1218 call .hv 1219 call .n 1220 dec hd 1221 jnz .main_loop 1222 test edgeb, 8 ; LR_HAVE_BOTTOM 1223 jz .extend_bottom 1224 mov lpfq, [rsp] 1225 call .hv_bottom 1226 call .n 1227 add lpfq, strideq 1228 call .hv_bottom 1229.end: 1230 call .n 1231 RET 1232.height1: 1233 call .v 1234 call .prep_n 1235 mov t2, t1 1236 call .v 1237 jmp .end 1238.extend_bottom: 1239 call .v 1240 call .n 1241 mov t2, t1 1242 call .v 1243 jmp .end 1244.no_top: 1245 lea t4, [lpfq+strideq*4] 1246 mov lpfq, dstq 1247 lea t4, [t4+strideq*2] 1248 mov [rsp], t4 1249 call .h 1250 lea t0, [t1+400*6] 1251 mov t2, t1 1252 call .v 1253 jmp .main 1254.h: ; horizontal boxsum 1255 lea r10, [wq-2] 1256 test edgeb, 1 ; LR_HAVE_LEFT 1257 jz .h_extend_left 1258 vpbroadcastd xm0, [leftq] 1259 mova xm5, [lpfq+wq] 1260 palignr xm5, xm0, 12 1261 add leftq, 4 1262 jmp .h_main 1263.h_extend_left: 1264 mova xm5, [lpfq+wq] 1265 pshufb xm5, [base+sgr_l_shuf] 1266 jmp .h_main 1267.h_top: 1268 lea r10, [wq-2] 1269 test edgeb, 1 ; LR_HAVE_LEFT 1270 jz .h_extend_left 1271.h_loop: 1272 movu xm5, [lpfq+r10-2] 1273.h_main: 1274 vinserti128 m5, [lpfq+r10+6], 1 1275 test edgeb, 2 ; LR_HAVE_RIGHT 1276 jnz .h_have_right 1277 cmp r10d, -17 1278 jl .h_have_right 1279 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1280.h_have_right: 1281 pshufb m0, m5, m8 1282 pmullw m2, m0, m0 1283 pshufb m4, m5, m9 1284 paddw m0, m4 1285 pshufb m5, m10 1286 paddw m0, m5 ; sum 1287 punpcklwd m3, m4, m5 1288 pmaddwd m3, m3 1289 punpckhwd m4, m5 1290 pmaddwd m4, m4 1291 punpcklwd m1, m2, m6 1292 punpckhwd m2, m6 1293 mova [t1+r10*2+400*0], m0 1294 paddd m1, m3 ; sumsq 1295 paddd m2, m4 1296 mova [t1+r10*2+400*2], m1 1297 mova [t1+r10*2+400*4], m2 1298 add r10, 16 1299 jl 
.h_loop 1300 ret 1301ALIGN function_align 1302.hv: ; horizontal boxsum + vertical boxsum + ab 1303 lea r10, [wq-2] 1304 test edgeb, 1 ; LR_HAVE_LEFT 1305 jz .hv_extend_left 1306 vpbroadcastd xm0, [leftq] 1307 mova xm5, [lpfq+wq] 1308 palignr xm5, xm0, 12 1309 add leftq, 4 1310 jmp .hv_main 1311.hv_extend_left: 1312 mova xm5, [lpfq+wq] 1313 pshufb xm5, [base+sgr_l_shuf] 1314 jmp .hv_main 1315.hv_bottom: 1316 lea r10, [wq-2] 1317 test edgeb, 1 ; LR_HAVE_LEFT 1318 jz .hv_extend_left 1319.hv_loop: 1320 movu xm5, [lpfq+r10-2] 1321.hv_main: 1322 vinserti128 m5, [lpfq+r10+6], 1 1323 test edgeb, 2 ; LR_HAVE_RIGHT 1324 jnz .hv_have_right 1325 cmp r10d, -17 1326 jl .hv_have_right 1327 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1328.hv_have_right: 1329 pshufb m0, m5, m8 1330 pmullw m3, m0, m0 1331 pshufb m1, m5, m9 1332 paddw m0, m1 1333 pshufb m5, m10 1334 paddw m0, m5 ; h sum 1335 punpcklwd m4, m5, m1 1336 pmaddwd m4, m4 1337 punpckhwd m5, m1 1338 pmaddwd m5, m5 1339 paddw m1, m0, [t2+r10*2+400*0] 1340 paddw m1, [t1+r10*2+400*0] ; hv sum 1341 punpcklwd m2, m3, m6 1342 punpckhwd m3, m6 1343 paddd m4, m2 ; h sumsq 1344 paddd m5, m3 1345 paddd m2, m4, [t2+r10*2+400*2] 1346 paddd m3, m5, [t2+r10*2+400*4] 1347 paddd m2, [t1+r10*2+400*2] ; hv sumsq 1348 paddd m3, [t1+r10*2+400*4] 1349 mova [t0+r10*2+400*0], m0 1350 punpcklwd m0, m1, m6 ; b 1351 punpckhwd m1, m6 1352 mova [t0+r10*2+400*2], m4 1353 pslld m4, m2, 3 1354 mova [t0+r10*2+400*4], m5 1355 pslld m5, m3, 3 1356 paddd m4, m2 ; a * 9 1357 pmaddwd m2, m0, m0 ; b * b 1358 paddd m5, m3 1359 pmaddwd m3, m1, m1 1360 psubd m4, m2 ; p 1361 psubd m5, m3 1362 pmulld m4, m11 ; p * s 1363 pmulld m5, m11 1364 pmaddwd m0, m12 ; b * 455 1365 pmaddwd m1, m12 1366 paddusw m4, m12 1367 paddusw m5, m12 1368 psrad m3, m4, 20 ; min(z, 255) - 256 1369 vpgatherdd m2, [r14+m3*4], m4 1370 psrad m4, m5, 20 1371 vpgatherdd m3, [r14+m4*4], m5 1372 pmulld m0, m2 1373 pmulld m1, m3 1374 paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) 1375 paddd m1, m13 1376 pand m0, m14 1377 pand m1, m14 1378 por m0, m2 ; a | (b << 12) 1379 por m1, m3 1380 mova [t3+r10*4+ 8], xm0 1381 vextracti128 [t3+r10*4+40], m0, 1 1382 mova [t3+r10*4+24], xm1 1383 vextracti128 [t3+r10*4+56], m1, 1 1384 add r10, 16 1385 jl .hv_loop 1386 mov t2, t1 1387 mov t1, t0 1388 mov t0, t2 1389 ret 1390.v: ; vertical boxsum + ab 1391 lea r10, [wq-2] 1392.v_loop: 1393 mova m1, [t1+r10*2+400*0] 1394 paddw m1, m1 1395 paddw m1, [t2+r10*2+400*0] ; hv sum 1396 mova m2, [t1+r10*2+400*2] 1397 mova m3, [t1+r10*2+400*4] 1398 paddd m2, m2 1399 paddd m3, m3 1400 paddd m2, [t2+r10*2+400*2] ; hv sumsq 1401 paddd m3, [t2+r10*2+400*4] 1402 punpcklwd m0, m1, m6 ; b 1403 punpckhwd m1, m6 1404 pslld m4, m2, 3 1405 pslld m5, m3, 3 1406 paddd m4, m2 ; a * 9 1407 pmaddwd m2, m0, m0 ; b * b 1408 paddd m5, m3 1409 pmaddwd m3, m1, m1 1410 psubd m4, m2 ; p 1411 psubd m5, m3 1412 pmulld m4, m11 ; p * s 1413 pmulld m5, m11 1414 pmaddwd m0, m12 ; b * 455 1415 pmaddwd m1, m12 1416 paddusw m4, m12 1417 paddusw m5, m12 1418 psrad m3, m4, 20 ; min(z, 255) - 256 1419 vpgatherdd m2, [r14+m3*4], m4 1420 psrad m4, m5, 20 1421 vpgatherdd m3, [r14+m4*4], m5 1422 pmulld m0, m2 1423 pmulld m1, m3 1424 paddd m0, m13 ; x * b * 455 + (1 << 11) + (1 << 15) 1425 paddd m1, m13 1426 pand m0, m14 1427 pand m1, m14 1428 por m0, m2 ; a | (b << 12) 1429 por m1, m3 1430 mova [t3+r10*4+ 8], xm0 1431 vextracti128 [t3+r10*4+40], m0, 1 1432 mova [t3+r10*4+24], xm1 1433 vextracti128 [t3+r10*4+56], m1, 1 1434 add r10, 16 1435 jl .v_loop 1436 ret 
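; The "343"/"222" values used by .prep_n and .n below are horizontally
; weighted sums of one row of packed a/b dwords. As a sketch (v[] names the
; packed row in t3/t4/t5 for this comment only):
;   t343[x] = 3*v[x-1] + 4*v[x] + 3*v[x+1]   ; built as 4*(l+c+r) - (l+r)
;   t222[x] = 2*v[x-1] + 2*v[x] + 2*v[x+1]
; .prep_n seeds the previous/current rows, and .n combines the current row's
; 222 sum with the 343 sums of the rows above and below (the 222 row counted
; twice), i.e. an effective 3-4-3 / 4-4-4 / 3-4-3 footprint per output pixel.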
1437.prep_n: ; initial neighbor setup 1438 mov r10, wq 1439 mov t4, t3 1440 add t3, 400*4 1441.prep_n_loop: 1442 mova m2, [t5+r10*4+0] 1443 mova m3, [t4+r10*4+0] 1444 paddd m2, [t5+r10*4+8] 1445 paddd m3, [t4+r10*4+8] 1446 paddd m0, m2, [t5+r10*4+4] 1447 paddd m1, m3, [t4+r10*4+4] 1448 pslld m0, 2 1449 paddd m1, m1 ; ab[ 0] 222 1450 psubd m0, m2 ; ab[-1] 343 1451 mova [t3+r10*4+400*4], m1 1452 paddd m1, m1 1453 mova [t5+r10*4], m0 1454 psubd m1, m3 ; ab[ 0] 343 1455 mova [t4+r10*4], m1 1456 add r10, 8 1457 jl .prep_n_loop 1458 ret 1459; a+b are packed together in a single dword, but we can't do the 1460; full neighbor calculations before splitting them since we don't 1461; have sufficient precision. The solution is to do the calculations 1462; in two equal halves and split a and b before doing the final sum. 1463ALIGN function_align 1464.n: ; neighbor + output 1465 mov r10, wq 1466.n_loop: 1467 mova m4, [t3+r10*4+ 0] 1468 paddd m4, [t3+r10*4+ 8] 1469 paddd m5, m4, [t3+r10*4+ 4] 1470 paddd m5, m5 ; ab[+1] 222 1471 mova m2, [t3+r10*4+400*4+ 0] 1472 paddd m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343 1473 mova m3, [t3+r10*4+400*4+32] 1474 paddd m1, m3, [t5+r10*4+32] 1475 mova [t3+r10*4+400*4+ 0], m5 1476 paddd m5, m5 1477 psubd m5, m4 ; ab[+1] 343 1478 mova [t5+r10*4+ 0], m5 1479 paddd m2, m5 ; ab[ 0] 222 + ab[+1] 343 1480 mova m4, [t3+r10*4+32] 1481 paddd m4, [t3+r10*4+40] 1482 paddd m5, m4, [t3+r10*4+36] 1483 paddd m5, m5 1484 mova [t3+r10*4+400*4+32], m5 1485 paddd m5, m5 1486 psubd m5, m4 1487 mova [t5+r10*4+32], m5 1488 pandn m4, m14, m0 1489 psrld m0, 12 1490 paddd m3, m5 1491 pandn m5, m14, m2 1492 psrld m2, 12 1493 paddd m4, m5 ; a 1494 pandn m5, m14, m1 1495 psrld m1, 12 1496 paddd m0, m2 ; b + (1 << 8) 1497 pandn m2, m14, m3 1498 psrld m3, 12 1499 paddd m5, m2 1500 pmovzxbd m2, [dstq+r10+0] 1501 paddd m1, m3 1502 pmovzxbd m3, [dstq+r10+8] 1503 pmaddwd m4, m2 ; a * src 1504 pmaddwd m5, m3 1505 packssdw m2, m3 1506 psubd m0, m4 ; b - a * src + (1 << 8) 1507 psubd m1, m5 1508 psrad m0, 9 1509 psrad m1, 9 1510 packssdw m0, m1 1511 pmulhrsw m0, m7 1512 paddw m0, m2 1513 vextracti128 xm1, m0, 1 1514 packuswb xm0, xm1 1515 pshufd xm0, xm0, q3120 1516 mova [dstq+r10], xm0 1517 add r10, 16 1518 jl .n_loop 1519 mov r10, t5 1520 mov t5, t4 1521 mov t4, r10 1522 add dstq, strideq 1523 ret 1524 1525cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \ 1526 w, h, edge, params 1527%define base r12-sgr_x_by_x_avx2-256*4 1528 lea r12, [sgr_x_by_x_avx2+256*4] 1529 mov paramsq, r6mp 1530 mov wd, wm 1531 movifnidn hd, hm 1532 mov edged, r7m 1533 vbroadcasti128 m9, [base+sgr_shuf+0] 1534 vbroadcasti128 m10, [base+sgr_shuf+8] 1535 add lpfq, wq 1536 vbroadcasti128 m11, [base+sgr_shuf+2] 1537 vbroadcasti128 m12, [base+sgr_shuf+6] 1538 add dstq, wq 1539 vpbroadcastd m15, [paramsq+8] ; w0 w1 1540 lea t3, [rsp+wq*4+400*24+8] 1541 vpbroadcastd m13, [paramsq+0] ; s0 1542 pxor m7, m7 1543 vpbroadcastd m14, [paramsq+4] ; s1 1544 lea t1, [rsp+wq*2+12] 1545 neg wq 1546 psllw m15, 2 ; to reuse existing pd_m4096 register for rounding 1547 test edgeb, 4 ; LR_HAVE_TOP 1548 jz .no_top 1549 call .h_top 1550 add lpfq, strideq 1551 mov t2, t1 1552 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup 1553 add t1, 400*12 1554 call .h_top 1555 lea r10, [lpfq+strideq*4] 1556 mov lpfq, dstq 1557 add r10, strideq 1558 mov [rsp], r10 ; below 1559 call .hv0 1560.main: 1561 dec hd 1562 jz .height1 1563 add lpfq, strideq 1564 call .hv1 1565 call .prep_n 1566 sub hd, 2 1567 jl .extend_bottom 
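; Each iteration of .main_loop below consumes two source rows: .hv0 handles
; the even row (producing the ab3 values only), .hv1 the odd row (producing
; both ab3 and ab5), after which .n0/.n1 write the two corresponding output
; rows.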
1568.main_loop: 1569 add lpfq, strideq 1570 call .hv0 1571 test hd, hd 1572 jz .odd_height 1573 add lpfq, strideq 1574 call .hv1 1575 call .n0 1576 call .n1 1577 sub hd, 2 1578 jge .main_loop 1579 test edgeb, 8 ; LR_HAVE_BOTTOM 1580 jz .extend_bottom 1581 mov lpfq, [rsp] 1582 call .hv0_bottom 1583 add lpfq, strideq 1584 call .hv1_bottom 1585.end: 1586 call .n0 1587 call .n1 1588.end2: 1589 RET 1590.height1: 1591 call .v1 1592 call .prep_n 1593 jmp .odd_height_end 1594.odd_height: 1595 call .v1 1596 call .n0 1597 call .n1 1598.odd_height_end: 1599 call .v0 1600 call .v1 1601 call .n0 1602 jmp .end2 1603.extend_bottom: 1604 call .v0 1605 call .v1 1606 jmp .end 1607.no_top: 1608 lea r10, [lpfq+strideq*4] 1609 mov lpfq, dstq 1610 lea r10, [r10+strideq*2] 1611 mov [rsp], r10 1612 call .h 1613 lea t2, [t1+400*12] 1614 lea r10, [wq-2] 1615.top_fixup_loop: 1616 mova m0, [t1+r10*2+400* 0] 1617 mova m1, [t1+r10*2+400* 2] 1618 mova m2, [t1+r10*2+400* 4] 1619 paddw m0, m0 1620 mova m3, [t1+r10*2+400* 6] 1621 paddd m1, m1 1622 mova m4, [t1+r10*2+400* 8] 1623 paddd m2, m2 1624 mova m5, [t1+r10*2+400*10] 1625 mova [t2+r10*2+400* 0], m0 1626 mova [t2+r10*2+400* 2], m1 1627 mova [t2+r10*2+400* 4], m2 1628 mova [t2+r10*2+400* 6], m3 1629 mova [t2+r10*2+400* 8], m4 1630 mova [t2+r10*2+400*10], m5 1631 add r10, 16 1632 jl .top_fixup_loop 1633 call .v0 1634 jmp .main 1635.h: ; horizontal boxsums 1636 lea r10, [wq-2] 1637 test edgeb, 1 ; LR_HAVE_LEFT 1638 jz .h_extend_left 1639 vpbroadcastd xm0, [leftq] 1640 mova xm5, [lpfq+wq] 1641 palignr xm5, xm0, 12 1642 add leftq, 4 1643 jmp .h_main 1644.h_extend_left: 1645 mova xm5, [lpfq+wq] 1646 pshufb xm5, [base+sgr_l_shuf] 1647 jmp .h_main 1648.h_top: 1649 lea r10, [wq-2] 1650 test edgeb, 1 ; LR_HAVE_LEFT 1651 jz .h_extend_left 1652.h_loop: 1653 movu xm5, [lpfq+r10-2] 1654.h_main: 1655 vinserti128 m5, [lpfq+r10+6], 1 1656 test edgeb, 2 ; LR_HAVE_RIGHT 1657 jnz .h_have_right 1658 cmp r10d, -18 1659 jl .h_have_right 1660 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1661.h_have_right: 1662 pshufb m6, m5, m9 1663 pshufb m4, m5, m10 1664 paddw m8, m6, m4 1665 shufps m0, m6, m4, q2121 1666 pmullw m3, m0, m0 1667 pshufb m2, m5, m11 1668 paddw m0, m2 1669 pshufb m5, m12 1670 paddw m0, m5 ; sum3 1671 punpcklwd m1, m2, m5 1672 pmaddwd m1, m1 1673 punpckhwd m2, m5 1674 pmaddwd m2, m2 1675 punpcklwd m5, m6, m4 1676 pmaddwd m5, m5 1677 punpckhwd m6, m4 1678 pmaddwd m6, m6 1679 punpcklwd m4, m3, m7 1680 paddd m1, m4 ; sumsq3 1681 punpckhwd m3, m7 1682 paddd m2, m3 1683 mova [t1+r10*2+400* 6], m0 1684 mova [t1+r10*2+400* 8], m1 1685 mova [t1+r10*2+400*10], m2 1686 paddw m8, m0 ; sum5 1687 paddd m5, m1 ; sumsq5 1688 paddd m6, m2 1689 mova [t1+r10*2+400* 0], m8 1690 mova [t1+r10*2+400* 2], m5 1691 mova [t1+r10*2+400* 4], m6 1692 add r10, 16 1693 jl .h_loop 1694 ret 1695ALIGN function_align 1696.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows) 1697 lea r10, [wq-2] 1698 test edgeb, 1 ; LR_HAVE_LEFT 1699 jz .hv0_extend_left 1700 vpbroadcastd xm0, [leftq] 1701 mova xm5, [lpfq+wq] 1702 palignr xm5, xm0, 12 1703 add leftq, 4 1704 jmp .hv0_main 1705.hv0_extend_left: 1706 mova xm5, [lpfq+wq] 1707 pshufb xm5, [base+sgr_l_shuf] 1708 jmp .hv0_main 1709.hv0_bottom: 1710 lea r10, [wq-2] 1711 test edgeb, 1 ; LR_HAVE_LEFT 1712 jz .hv0_extend_left 1713.hv0_loop: 1714 movu xm5, [lpfq+r10-2] 1715.hv0_main: 1716 vinserti128 m5, [lpfq+r10+6], 1 1717 test edgeb, 2 ; LR_HAVE_RIGHT 1718 jnz .hv0_have_right 1719 cmp r10d, -18 1720 jl .hv0_have_right 1721 call 
mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1722.hv0_have_right: 1723 pshufb m6, m5, m9 1724 pshufb m4, m5, m10 1725 paddw m8, m6, m4 1726 shufps m1, m6, m4, q2121 1727 pmullw m0, m1, m1 1728 pshufb m3, m5, m11 1729 paddw m1, m3 1730 pshufb m5, m12 1731 paddw m1, m5 ; sum3 1732 punpcklwd m2, m3, m5 1733 pmaddwd m2, m2 1734 punpckhwd m3, m5 1735 pmaddwd m3, m3 1736 punpcklwd m5, m6, m4 1737 pmaddwd m5, m5 1738 punpckhwd m6, m4 1739 pmaddwd m6, m6 1740 punpcklwd m4, m0, m7 1741 paddd m2, m4 ; sumsq3 1742 punpckhwd m0, m7 1743 paddd m3, m0 1744 paddw m8, m1 ; sum5 1745 paddd m5, m2 ; sumsq5 1746 paddd m6, m3 1747 mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row 1748 mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd 1749 mova [t3+r10*4+400*0+40], m6 1750 paddw m8, [t1+r10*2+400* 0] 1751 paddd m5, [t1+r10*2+400* 2] 1752 paddd m6, [t1+r10*2+400* 4] 1753 mova [t1+r10*2+400* 0], m8 1754 mova [t1+r10*2+400* 2], m5 1755 mova [t1+r10*2+400* 4], m6 1756 paddw m0, m1, [t1+r10*2+400* 6] 1757 paddd m4, m2, [t1+r10*2+400* 8] 1758 paddd m5, m3, [t1+r10*2+400*10] 1759 mova [t1+r10*2+400* 6], m1 1760 mova [t1+r10*2+400* 8], m2 1761 mova [t1+r10*2+400*10], m3 1762 paddw m1, m0, [t2+r10*2+400* 6] 1763 paddd m2, m4, [t2+r10*2+400* 8] 1764 paddd m3, m5, [t2+r10*2+400*10] 1765 mova [t2+r10*2+400* 6], m0 1766 mova [t2+r10*2+400* 8], m4 1767 mova [t2+r10*2+400*10], m5 1768 punpcklwd m0, m1, m7 ; b3 1769 punpckhwd m1, m7 1770 pslld m4, m2, 3 1771 pslld m5, m3, 3 1772 paddd m4, m2 ; a3 * 9 1773 pmaddwd m2, m0, m0 ; b3 * b 1774 paddd m5, m3 1775 pmaddwd m3, m1, m1 1776 psubd m4, m2 ; p3 1777 vpbroadcastd m2, [base+pd_0xf00801c7] 1778 psubd m5, m3 1779 pmulld m4, m14 ; p3 * s1 1780 pmulld m5, m14 1781 pmaddwd m0, m2 ; b3 * 455 1782 pmaddwd m1, m2 1783 paddusw m4, m2 1784 paddusw m5, m2 1785 psrad m3, m4, 20 ; min(z3, 255) - 256 1786 vpgatherdd m2, [r12+m3*4], m4 1787 psrad m4, m5, 20 1788 vpgatherdd m3, [r12+m4*4], m5 1789 vpbroadcastd m4, [base+pd_34816] 1790 pmulld m0, m2 1791 vpbroadcastd m5, [base+pd_m4096] 1792 pmulld m1, m3 1793 paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 1794 paddd m1, m4 1795 pand m0, m5 1796 pand m1, m5 1797 por m0, m2 ; a3 | (b3 << 12) 1798 por m1, m3 1799 mova [t3+r10*4+400*4+ 8], xm0 1800 vextracti128 [t3+r10*4+400*4+40], m0, 1 1801 mova [t3+r10*4+400*4+24], xm1 1802 vextracti128 [t3+r10*4+400*4+56], m1, 1 1803 add r10, 16 1804 jl .hv0_loop 1805 ret 1806ALIGN function_align 1807.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows) 1808 lea r10, [wq-2] 1809 test edgeb, 1 ; LR_HAVE_LEFT 1810 jz .hv1_extend_left 1811 vpbroadcastd xm0, [leftq] 1812 mova xm5, [lpfq+wq] 1813 palignr xm5, xm0, 12 1814 add leftq, 4 1815 jmp .hv1_main 1816.hv1_extend_left: 1817 mova xm5, [lpfq+wq] 1818 pshufb xm5, [base+sgr_l_shuf] 1819 jmp .hv1_main 1820.hv1_bottom: 1821 lea r10, [wq-2] 1822 test edgeb, 1 ; LR_HAVE_LEFT 1823 jz .hv1_extend_left 1824.hv1_loop: 1825 movu xm5, [lpfq+r10-2] 1826.hv1_main: 1827 vinserti128 m5, [lpfq+r10+6], 1 1828 test edgeb, 2 ; LR_HAVE_RIGHT 1829 jnz .hv1_have_right 1830 cmp r10d, -18 1831 jl .hv1_have_right 1832 call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right 1833.hv1_have_right: 1834 pshufb m6, m5, m9 1835 pshufb m3, m5, m10 1836 paddw m8, m6, m3 1837 shufps m2, m6, m3, q2121 1838 pmullw m1, m2, m2 1839 pshufb m0, m5, m11 1840 paddw m2, m0 1841 pshufb m5, m12 1842 paddw m2, m5 ; sum3 1843 punpcklwd m4, m5, m0 1844 pmaddwd m4, m4 1845 punpckhwd m5, m0 1846 pmaddwd m5, m5 1847 punpcklwd m0, m6, m3 1848 
pmaddwd m0, m0 1849 punpckhwd m6, m3 1850 pmaddwd m6, m6 1851 punpcklwd m3, m1, m7 1852 paddd m4, m3 ; sumsq3 1853 punpckhwd m1, m7 1854 paddd m5, m1 1855 paddw m1, m2, [t2+r10*2+400* 6] 1856 mova [t2+r10*2+400* 6], m2 1857 paddw m8, m2 ; sum5 1858 paddd m2, m4, [t2+r10*2+400* 8] 1859 paddd m3, m5, [t2+r10*2+400*10] 1860 mova [t2+r10*2+400* 8], m4 1861 mova [t2+r10*2+400*10], m5 1862 paddd m4, m0 ; sumsq5 1863 paddd m5, m6 1864 punpcklwd m0, m1, m7 ; b3 1865 punpckhwd m1, m7 1866 pslld m6, m2, 3 1867 pslld m7, m3, 3 1868 paddd m6, m2 ; a3 * 9 1869 pmaddwd m2, m0, m0 ; b3 * b3 1870 paddd m7, m3 1871 pmaddwd m3, m1, m1 1872 psubd m6, m2 ; p3 1873 vpbroadcastd m2, [base+pd_0xf00801c7] 1874 psubd m7, m3 1875 pmulld m6, m14 ; p3 * s1 1876 pmulld m7, m14 1877 pmaddwd m0, m2 ; b3 * 455 1878 pmaddwd m1, m2 1879 paddusw m6, m2 1880 paddusw m7, m2 1881 psrad m3, m6, 20 ; min(z3, 255) - 256 1882 vpgatherdd m2, [r12+m3*4], m6 1883 psrad m6, m7, 20 1884 vpgatherdd m3, [r12+m6*4], m7 1885 vpbroadcastd m6, [base+pd_34816] ; x3 1886 pmulld m0, m2 1887 vpbroadcastd m7, [base+pd_m4096] 1888 pmulld m1, m3 1889 paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 1890 paddd m1, m6 1891 pand m0, m7 1892 pand m7, m1 1893 por m0, m2 ; a3 | (b3 << 12) 1894 por m7, m3 1895 paddw m1, m8, [t2+r10*2+400*0] 1896 paddd m2, m4, [t2+r10*2+400*2] 1897 paddd m3, m5, [t2+r10*2+400*4] 1898 paddw m1, [t1+r10*2+400*0] 1899 paddd m2, [t1+r10*2+400*2] 1900 paddd m3, [t1+r10*2+400*4] 1901 mova [t2+r10*2+400*0], m8 1902 mova [t2+r10*2+400*2], m4 1903 mova [t2+r10*2+400*4], m5 1904 mova [t3+r10*4+400*8+ 8], xm0 1905 vextracti128 [t3+r10*4+400*8+40], m0, 1 1906 mova [t3+r10*4+400*8+24], xm7 1907 vextracti128 [t3+r10*4+400*8+56], m7, 1 1908 vpbroadcastd m4, [base+pd_25] 1909 pxor m7, m7 1910 punpcklwd m0, m1, m7 ; b5 1911 punpckhwd m1, m7 1912 pmulld m2, m4 ; a5 * 25 1913 pmulld m3, m4 1914 pmaddwd m4, m0, m0 ; b5 * b5 1915 pmaddwd m5, m1, m1 1916 psubd m2, m4 ; p5 1917 vpbroadcastd m4, [base+pd_0xf00800a4] 1918 psubd m3, m5 1919 pmulld m2, m13 ; p5 * s0 1920 pmulld m3, m13 1921 pmaddwd m0, m4 ; b5 * 164 1922 pmaddwd m1, m4 1923 paddusw m2, m4 1924 paddusw m3, m4 1925 psrad m5, m2, 20 ; min(z5, 255) - 256 1926 vpgatherdd m4, [r12+m5*4], m2 ; x5 1927 psrad m2, m3, 20 1928 vpgatherdd m5, [r12+m2*4], m3 1929 pmulld m0, m4 1930 pmulld m1, m5 1931 paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 1932 paddd m1, m6 1933 vpbroadcastd m6, [base+pd_m4096] 1934 pand m0, m6 1935 pand m1, m6 1936 por m0, m4 ; a5 | (b5 << 12) 1937 por m1, m5 1938 mova [t3+r10*4+400*0+ 8], xm0 1939 vextracti128 [t3+r10*4+400*0+40], m0, 1 1940 mova [t3+r10*4+400*0+24], xm1 1941 vextracti128 [t3+r10*4+400*0+56], m1, 1 1942 add r10, 16 1943 jl .hv1_loop 1944 mov r10, t2 1945 mov t2, t1 1946 mov t1, r10 1947 ret 1948.v0: ; vertical boxsums + ab3 (even rows) 1949 lea r10, [wq-2] 1950 vpbroadcastd m6, [base+pd_34816] 1951 vpbroadcastd m8, [base+pd_m4096] 1952.v0_loop: 1953 mova m0, [t1+r10*2+400* 6] 1954 mova m4, [t1+r10*2+400* 8] 1955 mova m5, [t1+r10*2+400*10] 1956 paddw m0, m0 1957 paddd m4, m4 1958 paddd m5, m5 1959 paddw m1, m0, [t2+r10*2+400* 6] 1960 paddd m2, m4, [t2+r10*2+400* 8] 1961 paddd m3, m5, [t2+r10*2+400*10] 1962 mova [t2+r10*2+400* 6], m0 1963 mova [t2+r10*2+400* 8], m4 1964 mova [t2+r10*2+400*10], m5 1965 punpcklwd m0, m1, m7 ; b3 1966 punpckhwd m1, m7 1967 pslld m4, m2, 3 1968 pslld m5, m3, 3 1969 paddd m4, m2 ; a3 * 9 1970 pmaddwd m2, m0, m0 ; b3 * b3 1971 paddd m5, m3 1972 pmaddwd m3, m1, m1 1973 psubd m4, m2 ; p3 1974 vpbroadcastd m2, 
[base+pd_0xf00801c7] 1975 psubd m5, m3 1976 pmulld m4, m14 ; p3 * s1 1977 pmulld m5, m14 1978 pmaddwd m0, m2 ; b3 * 455 1979 pmaddwd m1, m2 1980 paddusw m4, m2 1981 paddusw m5, m2 1982 psrad m3, m4, 20 ; min(z3, 255) - 256 1983 vpgatherdd m2, [r12+m3*4], m4 ; x3 1984 psrad m4, m5, 20 1985 vpgatherdd m3, [r12+m4*4], m5 1986 pmulld m0, m2 1987 pmulld m1, m3 1988 paddd m0, m6 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 1989 paddd m1, m6 1990 pand m0, m8 1991 pand m1, m8 1992 por m0, m2 ; a3 | (b3 << 12) 1993 por m1, m3 1994 mova m2, [t1+r10*2+400*0] 1995 mova m3, [t1+r10*2+400*2] 1996 mova m4, [t1+r10*2+400*4] 1997 mova [t3+r10*4+400*8+ 8], m2 1998 mova [t3+r10*4+400*0+ 8], m3 1999 mova [t3+r10*4+400*0+40], m4 2000 paddw m2, m2 ; cc5 2001 paddd m3, m3 2002 paddd m4, m4 2003 mova [t1+r10*2+400*0], m2 2004 mova [t1+r10*2+400*2], m3 2005 mova [t1+r10*2+400*4], m4 2006 mova [t3+r10*4+400*4+ 8], xm0 2007 vextracti128 [t3+r10*4+400*4+40], m0, 1 2008 mova [t3+r10*4+400*4+24], xm1 2009 vextracti128 [t3+r10*4+400*4+56], m1, 1 2010 add r10, 16 2011 jl .v0_loop 2012 ret 2013.v1: ; vertical boxsums + ab (odd rows) 2014 lea r10, [wq-2] 2015.v1_loop: 2016 mova m4, [t1+r10*2+400* 6] 2017 mova m5, [t1+r10*2+400* 8] 2018 mova m6, [t1+r10*2+400*10] 2019 paddw m1, m4, [t2+r10*2+400* 6] 2020 paddd m2, m5, [t2+r10*2+400* 8] 2021 paddd m3, m6, [t2+r10*2+400*10] 2022 mova [t2+r10*2+400* 6], m4 2023 mova [t2+r10*2+400* 8], m5 2024 mova [t2+r10*2+400*10], m6 2025 punpcklwd m0, m1, m7 ; b3 2026 punpckhwd m1, m7 2027 pslld m4, m2, 3 2028 pslld m5, m3, 3 2029 paddd m4, m2 ; a3 * 9 2030 pmaddwd m2, m0, m0 ; b3 * b3 2031 paddd m5, m3 2032 pmaddwd m3, m1, m1 2033 psubd m4, m2 ; p3 2034 vpbroadcastd m2, [base+pd_0xf00801c7] 2035 psubd m5, m3 2036 pmulld m4, m14 ; p3 * s1 2037 pmulld m5, m14 2038 pmaddwd m0, m2 ; b3 * 455 2039 pmaddwd m1, m2 2040 paddusw m4, m2 2041 paddusw m5, m2 2042 psrad m3, m4, 20 ; min(z3, 255) - 256 2043 vpgatherdd m2, [r12+m3*4], m4 ; x3 2044 psrad m4, m5, 20 2045 vpgatherdd m3, [r12+m4*4], m5 2046 vpbroadcastd m4, [base+pd_34816] 2047 pmulld m0, m2 2048 vpbroadcastd m8, [base+pd_m4096] 2049 pmulld m1, m3 2050 paddd m0, m4 ; x3 * b3 * 455 + (1 << 11) + (1 << 15) 2051 paddd m1, m4 2052 pand m0, m8 2053 pand m8, m1 2054 por m0, m2 ; a3 | (b3 << 12) 2055 por m8, m3 2056 mova m4, [t3+r10*4+400*8+ 8] 2057 mova m5, [t3+r10*4+400*0+ 8] 2058 mova m6, [t3+r10*4+400*0+40] 2059 paddw m1, m4, [t2+r10*2+400*0] 2060 paddd m2, m5, [t2+r10*2+400*2] 2061 paddd m3, m6, [t2+r10*2+400*4] 2062 paddw m1, [t1+r10*2+400*0] 2063 paddd m2, [t1+r10*2+400*2] 2064 paddd m3, [t1+r10*2+400*4] 2065 mova [t2+r10*2+400*0], m4 2066 mova [t2+r10*2+400*2], m5 2067 mova [t2+r10*2+400*4], m6 2068 vpbroadcastd m4, [base+pd_25] 2069 mova [t3+r10*4+400*8+ 8], xm0 2070 vextracti128 [t3+r10*4+400*8+40], m0, 1 2071 mova [t3+r10*4+400*8+24], xm8 2072 vextracti128 [t3+r10*4+400*8+56], m8, 1 2073 punpcklwd m0, m1, m7 ; b5 2074 punpckhwd m1, m7 2075 pmulld m2, m4 ; a5 * 25 2076 pmulld m3, m4 2077 pmaddwd m4, m0, m0 ; b5 * b5 2078 pmaddwd m5, m1, m1 2079 psubd m2, m4 ; p5 2080 vpbroadcastd m4, [base+pd_0xf00800a4] 2081 psubd m3, m5 2082 pmulld m2, m13 ; p5 * s0 2083 pmulld m3, m13 2084 pmaddwd m0, m4 ; b5 * 164 2085 pmaddwd m1, m4 2086 paddusw m2, m4 2087 paddusw m3, m4 2088 psrad m5, m2, 20 ; min(z5, 255) - 256 2089 vpgatherdd m4, [r12+m5*4], m2 ; x5 2090 psrad m2, m3, 20 2091 vpgatherdd m5, [r12+m2*4], m3 2092 pmulld m0, m4 2093 vpbroadcastd m6, [base+pd_34816] 2094 pmulld m1, m5 2095 paddd m0, m6 ; x5 * b5 * 164 + (1 << 11) + (1 << 15) 2096 paddd m1, 
m6 2097 vpbroadcastd m6, [base+pd_m4096] 2098 pand m0, m6 2099 pand m1, m6 2100 por m0, m4 ; a5 | (b5 << 12) 2101 por m1, m5 2102 mova [t3+r10*4+400*0+ 8], xm0 2103 vextracti128 [t3+r10*4+400*0+40], m0, 1 2104 mova [t3+r10*4+400*0+24], xm1 2105 vextracti128 [t3+r10*4+400*0+56], m1, 1 2106 add r10, 16 2107 jl .v1_loop 2108 mov r10, t2 2109 mov t2, t1 2110 mov t1, r10 2111 ret 2112.prep_n: ; initial neighbor setup 2113 mov r10, wq 2114.prep_n_loop: 2115 movu m0, [t3+r10*4+400*0+4] 2116 paddd m1, m0, [t3+r10*4+400*0+0] 2117 mova m4, [t3+r10*4+400*4+0] 2118 paddd m1, [t3+r10*4+400*0+8] 2119 mova m5, [t3+r10*4+400*8+0] 2120 paddd m4, [t3+r10*4+400*4+8] 2121 paddd m5, [t3+r10*4+400*8+8] 2122 paddd m2, m4, [t3+r10*4+400*4+4] 2123 paddd m3, m5, [t3+r10*4+400*8+4] 2124 paddd m0, m1 2125 pslld m1, 2 2126 pslld m2, 2 2127 paddd m1, m0 ; ab5 565 2128 paddd m3, m3 ; ab3[ 0] 222 2129 psubd m2, m4 ; ab3[-1] 343 2130 mova [t3+r10*4+400*20], m3 2131 pandn m0, m6, m1 ; a5 565 2132 mova [t3+r10*4+400*24], m2 2133 psrld m1, 12 ; b5 565 2134 mova [t3+r10*4+400*12], m0 2135 paddd m3, m3 2136 mova [t3+r10*4+400*16], m1 2137 psubd m3, m5 ; ab3[ 0] 343 2138 mova [t3+r10*4+400*28], m3 2139 add r10, 8 2140 jl .prep_n_loop 2141 ret 2142ALIGN function_align 2143.n0: ; neighbor + output (even rows) 2144 mov r10, wq 2145.n0_loop: 2146 movu m0, [t3+r10*4+4] 2147 paddd m4, m0, [t3+r10*4+0] 2148 paddd m4, [t3+r10*4+8] 2149 paddd m0, m4 2150 pslld m4, 2 2151 paddd m4, m0 2152 pandn m0, m6, m4 2153 psrld m4, 12 2154 paddd m2, m0, [t3+r10*4+400*12] ; a5 2155 mova [t3+r10*4+400*12], m0 2156 paddd m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8) 2157 mova [t3+r10*4+400*16], m4 2158 mova m3, [t3+r10*4+400*4+0] 2159 paddd m3, [t3+r10*4+400*4+8] 2160 paddd m5, m3, [t3+r10*4+400*4+4] 2161 paddd m5, m5 ; ab3[ 1] 222 2162 mova m4, [t3+r10*4+400*20] 2163 paddd m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343 2164 mova [t3+r10*4+400*20], m5 2165 paddd m5, m5 2166 psubd m5, m3 ; ab3[ 1] 343 2167 mova [t3+r10*4+400*24], m5 2168 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 2169 pandn m3, m6, m1 2170 psrld m1, 12 2171 pandn m5, m6, m4 2172 psrld m4, 12 2173 paddd m3, m5 ; a3 2174 paddd m1, m4 ; b3 + (1 << 8) 2175 pmovzxbd m4, [dstq+r10] 2176 pmaddwd m2, m4 ; a5 * src 2177 pmaddwd m3, m4 ; a3 * src 2178 psubd m0, m2 ; b5 - a5 * src + (1 << 8) 2179 psubd m1, m3 ; b3 - a3 * src + (1 << 8) 2180 psrld m0, 9 2181 pslld m1, 7 2182 pblendw m0, m1, 0xaa 2183 pmaddwd m0, m15 2184 psubd m0, m6 2185 psrad m0, 13 2186 paddd m0, m4 2187 vextracti128 xm1, m0, 1 2188 packssdw xm0, xm1 2189 packuswb xm0, xm0 2190 movq [dstq+r10], xm0 2191 add r10, 8 2192 jl .n0_loop 2193 add dstq, strideq 2194 ret 2195ALIGN function_align 2196.n1: ; neighbor + output (odd rows) 2197 mov r10, wq 2198.n1_loop: 2199 mova m3, [t3+r10*4+400*8+0] 2200 paddd m3, [t3+r10*4+400*8+8] 2201 paddd m5, m3, [t3+r10*4+400*8+4] 2202 paddd m5, m5 ; ab3[ 1] 222 2203 mova m4, [t3+r10*4+400*20] 2204 paddd m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343 2205 mova [t3+r10*4+400*20], m5 2206 paddd m5, m5 2207 psubd m5, m3 ; ab3[ 1] 343 2208 mova [t3+r10*4+400*28], m5 2209 paddd m4, m5 ; ab3[ 0] 222 + ab3[ 1] 343 2210 pandn m3, m6, m1 2211 psrld m1, 12 2212 pandn m5, m6, m4 2213 psrld m4, 12 2214 paddd m3, m5 ; -a3 2215 paddd m1, m4 ; b3 + (1 << 8) 2216 pmovzxbd m4, [dstq+r10] 2217 pmaddwd m2, m4, [t3+r10*4+400*12] ; -a5 * src 2218 mova m0, [t3+r10*4+400*16] ; b5 + (1 << 7) 2219 pmaddwd m3, m4 ; -a3 * src 2220 psubd m0, m2 ; a5 * src + b5 + (1 << 7) 2221 psubd m1, m3 ; a3 * src + b3 + (1 << 8) 
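; The 5x5 term (m0) and the 3x3 term (m1) are shifted into the even and odd
; 16-bit lanes of one register below, so a single pmaddwd against the packed
; w0/w1 weights in m15 applies both self-guided weights at once. Subtracting
; m6 (pd_m4096, i.e. -4096) then adds the 1 << 12 rounding bias for the final
; psrad by 13; the weights were shifted left by 2 at setup so this existing
; constant could double as the rounding term.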
    psrld           m0, 8
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    psubd           m0, m6
    psrad           m0, 13
    paddd           m0, m4
    vextracti128    xm1, m0, 1
    packssdw        xm0, xm1
    packuswb        xm0, xm0
    movq            [dstq+r10], xm0
    add             r10, 8
    jl              .n1_loop
    add             dstq, strideq
    ret

%endif ; ARCH_X86_64