; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

pb_mask:             dd 1, 1, 2, 2, 4, 4, 8, 8
pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
                     times 4 db 8, 9
                     times 4 db 0, 1
                     times 4 db 8, 9

pw_1:     times 16 dw 1
pw_2:     times 16 dw 2
pw_3:     times 16 dw 3
pw_4096:  times 2 dw 4096

; 10bpc/12bpc:
pw_4:     times 2 dw 4
          times 2 dw 16
clip_max: times 2 dw 511
          times 2 dw 2047
clip_min: times 2 dw -512
          times 2 dw -2048

SECTION .text

; in:              out:
; mm%1   a b c d        a e i m
; mm%2   e f g h        b f j n
; mm%3   i j k l   ->   c g k o
; mm%4   m n o p        d h l p
%macro TRANSPOSE4X4W 5
    punpcklwd        m%5, m%1, m%2
    punpckhwd        m%1, m%2
    punpcklwd        m%2, m%3, m%4
    punpckhwd        m%3, m%4
    punpckldq        m%4, m%5, m%2
    punpckhdq        m%5, m%2
    punpckldq        m%2, m%1, m%3
    punpckhdq        m%1, m%3

    SWAP %1, %4
    SWAP %2, %5, %3
%endmacro

; in:                          out:
; xmm%1   a b c d e f g h        a i q y 6 E M U
; xmm%2   i j k l m n o p        b j r z 7 F N V
; xmm%3   q r s t u v w x        c k s 0 8 G O W
; xmm%4   y z 0 1 2 3 4 5        d l t 1 9 H P X
; xmm%5   6 7 8 9 A B C D   ->   e m u 2 A I Q Y
; xmm%6   E F G H I J K L        f n v 3 B J R Z
; xmm%7   M N O P Q R S T        g o w 4 C K S +
; xmm%8   U V W X Y Z + =        h p x 5 D L T =
%macro TRANSPOSE8X8W 9
    ; xmm%1   a b c d e f g h      a i q y b j r z
    ; xmm%2   i j k l m n o p      c k s 0 d l t 1
    ; xmm%3   q r s t u v w x  ->  e m u 2 f n v 3
    ; xmm%4   y z 0 1 2 3 4 5      g o w 4 h p x 5
    TRANSPOSE4X4W     %1, %2, %3, %4, %9

    ; xmm%5   6 7 8 9 A B C D      6 E M U 7 F N V
    ; xmm%6   E F G H I J K L      8 G O W 9 H P X
    ; xmm%7   M N O P Q R S T  ->  A I Q Y B J R Z
    ; xmm%8   U V W X Y Z + =      C K S + D L T =
    TRANSPOSE4X4W     %5, %6, %7, %8, %9

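    ; The two 4x4-word sub-transposes above interleave rows from the top
    ; and bottom halves within each register; the punpck{l,h}qdq pass
    ; below pairs the matching qword halves to complete the 8x8 transpose.
    ; The trailing SWAP only renames registers at assembly time, so no
    ; extra move instructions are emitted.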
    ; xmm%1   a i q y b j r z      a i q y 6 E M U
    ; xmm%2   c k s 0 d l t 1      b j r z 7 F N V
    ; xmm%3   e m u 2 f n v 3      c k s 0 8 G O W
    ; xmm%4   g o w 4 h p x 5      d l t 1 9 H P X
    ; xmm%5   6 E M U 7 F N V  ->  e m u 2 A I Q Y
    ; xmm%6   8 G O W 9 H P X      f n v 3 B J R Z
    ; xmm%7   A I Q Y B J R Z      g o w 4 C K S +
    ; xmm%8   C K S + D L T =      h p x 5 D L T =
    punpckhqdq       m%9, m%1, m%5
    punpcklqdq       m%1, m%5
    punpckhqdq       m%5, m%2, m%6
    punpcklqdq       m%2, m%6
    punpckhqdq       m%6, m%3, m%7
    punpcklqdq       m%3, m%7
    punpckhqdq       m%7, m%4, m%8
    punpcklqdq       m%4, m%8

    SWAP %8, %7, %4, %5, %3, %2, %9
%endmacro

; transpose and write m3-6, everything else is scratch
%macro TRANSPOSE_8x4_AND_WRITE_4x16 0
    ; transpose 8x4
    punpcklwd        m0, m3, m4
    punpckhwd        m3, m4
    punpcklwd        m4, m5, m6
    punpckhwd        m5, m6
    punpckldq        m6, m0, m4
    punpckhdq        m0, m4
    punpckldq        m4, m3, m5
    punpckhdq        m3, m5

    ; write out
    movq             [dstq+strideq*0-4], xm6
    movhps           [dstq+strideq*1-4], xm6
    movq             [dstq+strideq*2-4], xm0
    movhps           [dstq+stride3q -4], xm0
    lea              dstq, [dstq+strideq*4]
    movq             [dstq+strideq*0-4], xm4
    movhps           [dstq+strideq*1-4], xm4
    movq             [dstq+strideq*2-4], xm3
    movhps           [dstq+stride3q -4], xm3
    lea              dstq, [dstq+strideq*4]

    vextracti128     xm6, m6, 1
    vextracti128     xm0, m0, 1
    vextracti128     xm4, m4, 1
    vextracti128     xm3, m3, 1

    movq             [dstq+strideq*0-4], xm6
    movhps           [dstq+strideq*1-4], xm6
    movq             [dstq+strideq*2-4], xm0
    movhps           [dstq+stride3q -4], xm0
    lea              dstq, [dstq+strideq*4]
    movq             [dstq+strideq*0-4], xm4
    movhps           [dstq+strideq*1-4], xm4
    movq             [dstq+strideq*2-4], xm3
    movhps           [dstq+stride3q -4], xm3
    lea              dstq, [dstq+strideq*4]
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%if %1 == 4
    lea              tmpq, [dstq+mstrideq*2]
    mova             m3, [tmpq+strideq*0]      ; p1
    mova             m4, [tmpq+strideq*1]      ; p0
    mova             m5, [tmpq+strideq*2]      ; q0
    mova             m6, [tmpq+stride3q]       ; q1
%else
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea              tmpq, [dstq+mstrideq*4]
    ; we load p3 later
    mova             m13, [tmpq+strideq*1]
    mova             m3, [tmpq+strideq*2]
    mova             m4, [tmpq+stride3q]
    mova             m5, [dstq+strideq*0]
    mova             m6, [dstq+strideq*1]
    mova             m14, [dstq+strideq*2]
%if %1 != 6
    mova             m15, [dstq+stride3q]
%endif
%endif
%else
    ; load lines
%if %1 == 4
    movq             xm3, [dstq+strideq*0-4]
    movq             xm4, [dstq+strideq*1-4]
    movq             xm5, [dstq+strideq*2-4]
    movq             xm6, [dstq+stride3q -4]
    lea              tmpq, [dstq+strideq*4]
    movq             xm11, [tmpq+strideq*0-4]
    movq             xm13, [tmpq+strideq*1-4]
    movq             xm14, [tmpq+strideq*2-4]
    movq             xm15, [tmpq+stride3q -4]
    lea              tmpq, [tmpq+strideq*4]
    ; this overreads by 8 bytes but the buffers are padded
    ; so that should be ok
    vinserti128      m3, [tmpq+strideq*0-4], 1
    vinserti128      m4, [tmpq+strideq*1-4], 1
    vinserti128      m5, [tmpq+strideq*2-4], 1
    vinserti128      m6, [tmpq+stride3q -4], 1
    lea              tmpq, [tmpq+strideq*4]
    vinserti128      m11, [tmpq+strideq*0-4], 1
    vinserti128      m13, [tmpq+strideq*1-4], 1
    vinserti128      m14, [tmpq+strideq*2-4], 1
    vinserti128      m15, [tmpq+stride3q -4], 1

    ; transpose 4x8
    ; xm3: A-D0,A-D4
    ; xm4: A-D1,A-D5
    ; xm5: A-D2,A-D6
    ; xm6: A-D3,A-D7
    punpcklwd        m7, m3, m4
    punpcklwd        m3, m11, m13
    punpcklwd        m4, m5, m6
    punpcklwd        m5, m14, m15
    ; xm7: A0-1,B0-1,C0-1,D0-1
    ; xm3: A4-5,B4-5,C4-5,D4-5
    ; xm4: A2-3,B2-3,C2-3,D2-3
    ; xm5: A6-7,B6-7,C6-7,D6-7
    punpckldq        m6, m7, m4
    punpckhdq        m7, m4
    punpckldq        m8, m3, m5
    punpckhdq        m5, m3, m5
    ; xm6: A0-3,B0-3
    ; xm7: C0-3,D0-3
    ; xm8: A4-7,B4-7
    ; xm5: C4-7,D4-7
    punpcklqdq       m3, m6, m8
    punpckhqdq       m4, m6, m8
    punpckhqdq       m6, m7, m5
    punpcklqdq       m5, m7, m5
    ; xm3: A0-7
    ; xm4: B0-7
    ; xm5: C0-7
    ; xm6: D0-7
%elif %1 == 6 || %1 == 8
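    ; for 6/8-wide horizontal filtering, gather 16 rows of 8 pixels
    ; (4 on each side of the edge, i.e. p3-q3); the 8x16 word transpose
    ; below turns rows into columns so each register holds one tap
    ; position across all 16 rows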
    movu             xm3, [dstq+strideq*0-8]
    movu             xm4, [dstq+strideq*1-8]
    movu             xm5, [dstq+strideq*2-8]
    movu             xm6, [dstq+stride3q -8]
    lea              tmpq, [dstq+strideq*4]
    movu             xm11, [tmpq+strideq*0-8]
    movu             xm13, [tmpq+strideq*1-8]
    movu             xm14, [tmpq+strideq*2-8]
    movu             xm15, [tmpq+stride3q -8]
    lea              tmpq, [tmpq+strideq*4]
    vinserti128      m3, [tmpq+strideq*0-8], 1
    vinserti128      m4, [tmpq+strideq*1-8], 1
    vinserti128      m5, [tmpq+strideq*2-8], 1
    vinserti128      m6, [tmpq+stride3q -8], 1
    lea              tmpq, [tmpq+strideq*4]
    vinserti128      m11, [tmpq+strideq*0-8], 1
    vinserti128      m13, [tmpq+strideq*1-8], 1
    vinserti128      m14, [tmpq+strideq*2-8], 1
    vinserti128      m15, [tmpq+stride3q -8], 1

    ; transpose 8x16
    ; xm3:  A-H0,A-H8
    ; xm4:  A-H1,A-H9
    ; xm5:  A-H2,A-H10
    ; xm6:  A-H3,A-H11
    ; xm11: A-H4,A-H12
    ; xm13: A-H5,A-H13
    ; xm14: A-H6,A-H14
    ; xm15: A-H7,A-H15
    punpcklwd        m7, m3, m4
    punpckhwd        m3, m4
    punpcklwd        m4, m5, m6
    punpckhwd        m5, m6
    punpcklwd        m6, m11, m13
    punpckhwd        m11, m13
    punpcklwd        m13, m14, m15
    punpckhwd        m14, m15
    ; xm7:  A0-1,B0-1,C0-1,D0-1
    ; xm3:  E0-1,F0-1,G0-1,H0-1
    ; xm4:  A2-3,B2-3,C2-3,D2-3
    ; xm5:  E2-3,F2-3,G2-3,H2-3
    ; xm6:  A4-5,B4-5,C4-5,D4-5
    ; xm11: E4-5,F4-5,G4-5,H4-5
    ; xm13: A6-7,B6-7,C6-7,D6-7
    ; xm14: E6-7,F6-7,G6-7,H6-7
    punpckldq        m15, m7, m4
    punpckhdq        m7, m4
    punpckldq        m9, m3, m5
    punpckhdq        m8, m3, m5
    punpckldq        m3, m6, m13
    punpckhdq        m6, m13
    punpckldq        m10, m11, m14
    punpckhdq        m11, m14
    ; xm15: A0-3,B0-3
    ; xm7:  C0-3,D0-3
    ; xm9:  E0-3,F0-3
    ; xm8:  G0-3,H0-3
    ; xm3:  A4-7,B4-7
    ; xm6:  C4-7,D4-7
    ; xm10: E4-7,F4-7
    ; xm11: G4-7,H4-7
%if %1 != 6
    punpcklqdq       m0, m15, m3
%endif
    punpckhqdq       m13, m15, m3
    punpcklqdq       m3, m7, m6
    punpckhqdq       m4, m7, m6
    punpcklqdq       m5, m9, m10
    punpckhqdq       m6, m9, m10
    punpcklqdq       m14, m8, m11
%if %1 != 6
    punpckhqdq       m15, m8, m11
    mova             [rsp+5*32], m0
%endif
%else
    ; We only use 14 pixels but we'll need the remainder at the end for
    ; the second transpose
    mova             xm0, [dstq+strideq*0-16]
    mova             xm1, [dstq+strideq*1-16]
    mova             xm2, [dstq+strideq*2-16]
    mova             xm3, [dstq+stride3q -16]
    lea              tmpq, [dstq+strideq*4]
    mova             xm4, [tmpq+strideq*0-16]
    mova             xm5, [tmpq+strideq*1-16]
    mova             xm6, [tmpq+strideq*2-16]
    mova             xm7, [tmpq+stride3q -16]
    lea              tmpq, [tmpq+strideq*4]
    vinserti128      m0, m0, [tmpq+strideq*0-16], 1
    vinserti128      m1, m1, [tmpq+strideq*1-16], 1
    vinserti128      m2, m2, [tmpq+strideq*2-16], 1
    vinserti128      m3, m3, [tmpq+stride3q -16], 1
    lea              tmpq, [tmpq+strideq*4]
    vinserti128      m4, m4, [tmpq+strideq*0-16], 1
    vinserti128      m5, m5, [tmpq+strideq*1-16], 1
    vinserti128      m6, m6, [tmpq+strideq*2-16], 1
    vinserti128      m7, m7, [tmpq+stride3q -16], 1

    TRANSPOSE8X8W     0, 1, 2, 3, 4, 5, 6, 7, 8

    mova             [rsp+6*32], m0
    mova             [rsp+7*32], m1
    mova             [rsp+8*32], m2
    mova             [rsp+9*32], m3
    mova             [rsp+5*32], m4

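    ; the left half is now transposed: the outer columns (p7-p3) are
    ; spilled to the stack while p2-p0 stay in m5-m7 (renamed below);
    ; load and transpose the right half (q0-q7) the same way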
    mova             xm0, [dstq+strideq*0]
    mova             xm1, [dstq+strideq*1]
    mova             xm2, [dstq+strideq*2]
    mova             xm3, [dstq+stride3q ]
    lea              tmpq, [dstq+strideq*4]
    mova             xm8, [tmpq+strideq*0]
    mova             xm9, [tmpq+strideq*1]
    mova             xm10, [tmpq+strideq*2]
    mova             xm11, [tmpq+stride3q ]
    lea              tmpq, [tmpq+strideq*4]
    vinserti128      m0, m0, [tmpq+strideq*0], 1
    vinserti128      m1, m1, [tmpq+strideq*1], 1
    vinserti128      m2, m2, [tmpq+strideq*2], 1
    vinserti128      m3, m3, [tmpq+stride3q ], 1
    lea              tmpq, [tmpq+strideq*4]
    vinserti128      m8, m8, [tmpq+strideq*0], 1
    vinserti128      m9, m9, [tmpq+strideq*1], 1
    vinserti128      m10, m10, [tmpq+strideq*2], 1
    vinserti128      m11, m11, [tmpq+stride3q ], 1

    TRANSPOSE8X8W     0, 1, 2, 3, 8, 9, 10, 11, 4

    mova             [rsp+10*32], m8
    mova             [rsp+11*32], m9
    mova             [rsp+12*32], m10
    mova             [rsp+13*32], m11

    ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15
    SWAP              13, 5, 0
    SWAP               3, 6, 1, 15
    SWAP               4, 7
    SWAP               2, 14
%endif
%endif

    ; load L/E/I/H
%ifidn %2, v
    pmovzxbw         m1, [lq]
    pmovzxbw         m0, [lq+l_strideq]
    pxor             m2, m2
%else
    vpbroadcastq     m0, [lq]                  ; l0, l1
    vpbroadcastq     m1, [lq+l_strideq]        ; l2, l3
    vpbroadcastq     m2, [lq+l_strideq*2]      ; l4, l5
    vpbroadcastq     m10, [lq+l_stride3q]      ; l6, l7
    punpckldq        m0, m1                    ; l0, l2, l1, l3 [2x]
    punpckldq        m2, m10                   ; l4, l6, l5, l7 [2x]
    vpblendd         m0, m0, m2, 11110000b     ; l0, l2, l1, l3, l4, l6, l5, l7
    pxor             m2, m2
    punpcklbw        m1, m0, m2                ; l0, l2, l4, l6
    punpckhbw        m0, m2                    ; l1, l3, l5, l7
%endif
    pcmpeqw          m10, m2, m0
    pand             m1, m10
    por              m0, m1                    ; l[x][] ? l[x][] : l[x-stride][]
    pshufb           m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1]
    pcmpeqw          m10, m2, m0               ; !L
    psrlw            m10, 1
    psrlw            m2, m0, [lutq+128]
    vpbroadcastw     m1, [lutq+136]
    pminuw           m2, m1
    pmaxuw           m2, [pw_1]                ; I
    psrlw            m1, m0, 4                 ; H
    paddw            m0, [pw_2]
    vpbroadcastd     m8, [r11]
    paddw            m0, m0
    paddw            m0, m2                    ; E
    REPX             {pmullw x, m8}, m0, m1, m2

    psubw            m8, m3, m4                ; p1-p0
    psubw            m9, m5, m6                ; q1-q0
    REPX             {pabsw x, x}, m8, m9
    pmaxuw           m8, m10
    pmaxuw           m8, m9
    pcmpgtw          m7, m8, m1                ; hev
%if %1 != 4
    psubw            m9, m13, m4               ; p2-p0
    pabsw            m9, m9
    pmaxuw           m9, m8
%if %1 != 6
%ifidn %2, v
    mova             m11, [tmpq+strideq*0]     ; p3
%else
    mova             m11, [rsp+5*32]           ; p3
%endif
    psubw            m10, m11, m4              ; p3-p0
    pabsw            m10, m10
    pmaxuw           m9, m10
%endif
    psubw            m10, m5, m14              ; q2-q0
    pabsw            m10, m10
    pmaxuw           m9, m10
%if %1 != 6
    psubw            m10, m5, m15              ; q3-q0
    pabsw            m10, m10
    pmaxuw           m9, m10
%endif
    vpbroadcastd     m10, [r11]
    pcmpgtw          m9, m10                   ; !flat8in

    psubw            m10, m13, m3              ; p2-p1
    pabsw            m10, m10
%if %1 != 6
    psubw            m11, m13                  ; p3-p2
    pabsw            m11, m11
    pmaxuw           m10, m11
    psubw            m11, m14, m15             ; q3-q2
    pabsw            m11, m11
    pmaxuw           m10, m11
%endif
    psubw            m11, m14, m6              ; q2-q1
    pabsw            m11, m11
    pmaxuw           m10, m11

%if %1 == 16
    vpbroadcastd     m11, [maskq+8]
    vpbroadcastd     m1, [maskq+4]
    por              m11, m1
    pand             m11, m12
    pcmpeqd          m11, m12
    pand             m10, m11
%else
    vpbroadcastd     m11, [maskq+4]
    pand             m11, m12
    pcmpeqd          m11, m12
    pand             m10, m11                  ; only apply fm-wide to wd>4 blocks
%endif
    pmaxuw           m8, m10
%endif
    pcmpgtw          m8, m2

    psubw            m10, m3, m6               ; p1-q1
    psubw            m11, m4, m5               ; p0-q0
    REPX             {pabsw x, x}, m10, m11
    paddw            m11, m11
    psrlw            m10, 1
    paddw            m10, m11                  ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    pcmpgtw          m10, m0                   ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
    por              m8, m10

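    ; for wd=16, extend the flatness check outwards: compare
    ; abs(p6..p4 - p0) and abs(q4..q6 - q0) against the same [r11]
    ; threshold (4 for 10bpc, 16 for 12bpc) to derive flat8out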
%if %1 == 16

%ifidn %2, v
    lea              tmpq, [dstq+mstrideq*8]
    mova             m0, [tmpq+strideq*1]
    mova             m1, [tmpq+strideq*2]
    mova             m2, [tmpq+stride3q]
%else
    mova             m0, [rsp+7*32]
    mova             m1, [rsp+8*32]
    mova             m2, [rsp+9*32]
%endif
    REPX             {psubw x, m4}, m0, m1, m2
    REPX             {pabsw x, x}, m0, m1, m2
    pmaxuw           m1, m0
    pmaxuw           m1, m2
%ifidn %2, v
    lea              tmpq, [dstq+strideq*4]
    mova             m0, [tmpq+strideq*0]
    mova             m2, [tmpq+strideq*1]
    mova             m10, [tmpq+strideq*2]
%else
    mova             m0, [rsp+10*32]
    mova             m2, [rsp+11*32]
    mova             m10, [rsp+12*32]
%endif
    REPX             {psubw x, m5}, m0, m2, m10
    REPX             {pabsw x, x}, m0, m2, m10
    pmaxuw           m0, m2
    pmaxuw           m1, m10
    pmaxuw           m1, m0
    vpbroadcastd     m0, [r11]
    pcmpgtw          m1, m0                    ; !flat8out
    por              m1, m9                    ; !flat8in | !flat8out
    vpbroadcastd     m2, [maskq+8]
    pand             m10, m2, m12
    pcmpeqd          m10, m12
    pandn            m1, m10                   ; flat16
    pandn            m1, m8, m1                ; flat16 & fm

    vpbroadcastd     m10, [maskq+4]
    por              m10, m2
    pand             m2, m10, m12
    pcmpeqd          m2, m12
    pandn            m9, m2                    ; flat8in
    pandn            m9, m8, m9
    vpbroadcastd     m2, [maskq+0]
    por              m2, m10
    pand             m2, m12
    pcmpeqd          m2, m12
    pandn            m8, m2
    pandn            m8, m9, m8                ; fm & !flat8 & !flat16
    pandn            m9, m1, m9                ; flat8 & !flat16
%elif %1 != 4
    vpbroadcastd     m0, [maskq+4]
    pand             m2, m0, m12
    pcmpeqd          m2, m12
    pandn            m9, m2
    pandn            m9, m8, m9                ; flat8 & fm
    vpbroadcastd     m2, [maskq+0]
    por              m0, m2
    pand             m0, m12
    pcmpeqd          m0, m12
    pandn            m8, m0
    pandn            m8, m9, m8                ; fm & !flat8
%else
    vpbroadcastd     m0, [maskq+0]
    pand             m0, m12
    pcmpeqd          m0, m12
    pandn            m8, m0                    ; fm
%endif

    ; short filter
    vpbroadcastd     m0, [r11+8*1]             ; 511 or 2047
    vpbroadcastd     m2, [r11+8*2]             ; -512 or -2048
    psubw            m10, m5, m4
    paddw            m11, m10, m10
    paddw            m11, m10
    psubw            m10, m3, m6               ; iclip_diff(p1-q1)
    pminsw           m10, m0
    pmaxsw           m10, m2
    pand             m10, m7                   ; f=iclip_diff(p1-q1)&hev
    paddw            m10, m11                  ; f=iclip_diff(3*(q0-p0)+f)
    pminsw           m10, m0
    pmaxsw           m10, m2
    pand             m8, m10                   ; f&=fm
    vpbroadcastd     m10, [pw_4]
    paddw            m10, m8
    paddw            m8, [pw_3]
    REPX             {pminsw x, m0}, m10, m8
    psraw            m10, 3                    ; f2
    psraw            m8, 3                     ; f1
    psubw            m5, m10
    paddw            m4, m8

    paddw            m10, [pw_1]
    psraw            m10, 1                    ; f=(f1+1)>>1
    pandn            m8, m7, m10               ; f&=!hev
    paddw            m3, m8
    psubw            m6, m8
    pxor             m8, m8
    psubw            m0, m2                    ; 1023 or 4095
    REPX             {pminsw x, m0}, m3, m4, m5, m6
    REPX             {pmaxsw x, m8}, m3, m4, m5, m6

%if %1 == 16

; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16
; m12=filter bits mask
; m13-15=p2/q2/q3
; m0,2,7-8,10-11 = free

    ; flat16 filter
%ifidn %2, v
    lea              tmpq, [dstq+mstrideq*8]
    mova             m0, [tmpq+strideq*1]      ; p6
    mova             m2, [tmpq+strideq*2]      ; p5
    mova             m7, [tmpq+stride3q]       ; p4
    mova             m11, [tmpq+strideq*4]     ; p3
%else
    mova             m0, [rsp+7*32]
    mova             m2, [rsp+8*32]
    mova             m7, [rsp+9*32]
    mova             m11, [rsp+5*32]
%endif

    mova             [rsp+0*32], m9

    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    paddw            m8, m0, [pw_1]
    psllw            m8, 3                     ; p6*8+8
    paddw            m10, m2, m7               ; p5+p4
    psubw            m8, m0
    paddw            m10, m10                  ; (p5+p4)*2
    paddw            m8, m11                   ; p6*7+p3
    paddw            m10, m13                  ; (p5+p4)*2+p2
    paddw            m8, m3                    ; p6*7+p3+p1
    paddw            m10, m4                   ; (p5+p4)*2+p2+p0
    paddw            m8, m5                    ; p6*7+p3+p1+q0
    paddw            m8, m10                   ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    psrlw            m10, m8, 4
    vpblendvb        m10, m2, m10, m1
%ifidn %2, v
    mova             [tmpq+strideq*2], m10     ; p5
%else
    mova             [rsp+8*32], m10
%endif

    ; sub p6*2, add p3/q1
    paddw            m8, m11
    paddw            m10, m0, m0
    paddw            m8, m6
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m10, m7, m10, m1
%ifidn %2, v
    mova             [tmpq+stride3q], m10      ; p4
%else
    mova             [rsp+9*32], m10
%endif

    ; sub p6/p5, add p2/q2
    psubw            m8, m0
    paddw            m10, m13, m14
    psubw            m8, m2
    paddw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m10, m11, m10, m1
%ifidn %2, v
    mova             [tmpq+strideq*4], m10     ; p3
    lea              tmpq, [dstq+strideq*4]
%else
    mova             [rsp+5*32], m10
%endif

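    ; each remaining output keeps the running sum sliding: subtract the
    ; two taps that leave the window, add the two that enter, then
    ; out = sum >> 4, blended in only where the flat16 mask (m1) is set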
    ; sub p6/p4, add p1/q3
    paddw            m8, m3
    paddw            m10, m0, m7
    paddw            m8, m15
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m10, m13, m10, m1
    mova             [rsp+1*32], m10           ; don't clobber p2/m13

    ; sub p6/p3, add p0/q4
    paddw            m8, m4
    paddw            m10, m0, m11
%ifidn %2, v
    paddw            m8, [tmpq+strideq*0]
%else
    paddw            m8, [rsp+10*32]
%endif
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m10, m3, m10, m1
    mova             [rsp+2*32], m10           ; don't clobber p1/m3

    ; sub p6/p2, add q0/q5
    paddw            m8, m5
    paddw            m10, m0, m13
%ifidn %2, v
    paddw            m8, [tmpq+strideq*1]
%else
    paddw            m8, [rsp+11*32]
%endif
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m10, m4, m10, m1
    mova             [rsp+3*32], m10           ; don't clobber p0/m4

    ; sub p6/p1, add q1/q6
    paddw            m8, m6
    paddw            m10, m0, m3
%ifidn %2, v
    mova             m0, [tmpq+strideq*2]      ; q6
%else
    mova             m0, [rsp+12*32]           ; q6
%endif
    paddw            m8, m0
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m10, m5, m10, m1
    mova             [rsp+4*32], m10           ; don't clobber q0/m5

    ; sub p5/p0, add q2/q6
    paddw            m8, m14
    paddw            m10, m2, m4
    paddw            m8, m0
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m2, m6, m10, m1           ; don't clobber q1/m6

    ; sub p4/q0, add q3/q6
    paddw            m8, m15
    paddw            m10, m7, m5
    paddw            m8, m0
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m7, m14, m10, m1          ; don't clobber q2/m14

    ; sub p3/q1, add q4/q6
%ifidn %2, v
    paddw            m8, [tmpq+strideq*0]
%else
    paddw            m8, [rsp+10*32]
%endif
    paddw            m10, m11, m6
    paddw            m8, m0
    psubw            m8, m10
    psrlw            m10, m8, 4
    vpblendvb        m10, m15, m10, m1
%ifidn %2, v
    mova             [tmpq+mstrideq], m10      ; q3
%else
    mova             [rsp+14*32], m10
%endif

    ; sub p2/q2, add q5/q6
%ifidn %2, v
    paddw            m8, [tmpq+strideq*1]
%else
    paddw            m8, [rsp+11*32]
%endif
    paddw            m10, m13, m14
    paddw            m8, m0
    psubw            m8, m10
    psrlw            m10, m8, 4
%ifidn %2, v
    mova             m9, [tmpq+strideq*0]
%else
    mova             m9, [rsp+10*32]
%endif
    vpblendvb        m10, m9, m10, m1
%ifidn %2, v
    mova             [tmpq+strideq*0], m10     ; q4
%else
    mova             [rsp+10*32], m10
%endif

    ; sub p1/q3, add q6*2
    psubw            m8, m3
    paddw            m0, m0
    psubw            m8, m15
    paddw            m8, m0
    psrlw            m10, m8, 4
%ifidn %2, v
    mova             m9, [tmpq+strideq*1]
%else
    mova             m9, [rsp+11*32]
%endif
    vpblendvb        m10, m9, m10, m1
%ifidn %2, v
    mova             [tmpq+strideq*1], m10     ; q5
%else
    mova             [rsp+11*32], m10
%endif

    mova             m9, [rsp+0*32]
    mova             m13, [rsp+1*32]
    mova             m3, [rsp+2*32]
    mova             m4, [rsp+3*32]
    mova             m5, [rsp+4*32]
    SWAP              2, 6
    SWAP              7, 14
%ifidn %2, v
    lea              tmpq, [dstq+mstrideq*4]
%else
    mova             m15, [rsp+14*32]
%endif
%endif

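    ; flat8 outputs are (sum + 4) >> 3 over an 8-entry window, e.g.
    ; p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3; pmulhrsw with 4096,
    ; i.e. round(x*4096 / 32768), performs that rounding shift in one op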
%if %1 >= 8
    ; flat8 filter
    vpbroadcastd     m7, [pw_4096]
%ifidn %2, v
    mova             m0, [tmpq+strideq*0]      ; p3
%else
    mova             m0, [rsp+5*32]            ; p3
%endif
    paddw            m1, m0, m13               ; p3+p2
    paddw            m2, m3, m4                ; p1+p0
    paddw            m8, m1, m1                ; 2*(p3+p2)
    paddw            m2, m0                    ; p1+p0+p3
    paddw            m8, m5                    ; 2*(p3+p2)+q0
    paddw            m2, m8                    ; 3*p3+2*p2+p1+p0+q0
    pmulhrsw         m10, m2, m7

    paddw            m8, m3, m6
    psubw            m2, m1
    paddw            m2, m8
    pmulhrsw         m8, m2, m7

    paddw            m11, m0, m3
    paddw            m1, m4, m14
    psubw            m2, m11
    paddw            m2, m1
    pmulhrsw         m1, m2, m7

    paddw            m11, m0, m4
    pblendvb         m4, m1, m9
    paddw            m1, m5, m15
    psubw            m2, m11
    paddw            m2, m1
    pmulhrsw         m11, m2, m7

    paddw            m2, m6
    paddw            m2, m15
    paddw            m1, m13, m5
    pblendvb         m5, m11, m9
    pblendvb         m13, m10, m9
    psubw            m2, m1
    pmulhrsw         m1, m2, m7

    psubw            m2, m3
    pblendvb         m3, m8, m9
    psubw            m2, m6
    pblendvb         m6, m1, m9
    paddw            m1, m15, m14
    paddw            m2, m1
    pmulhrsw         m2, m7

    pblendvb         m14, m2, m9

%ifidn %2, v
    mova             [tmpq+strideq*1], m13     ; p2
    mova             [tmpq+strideq*2], m3      ; p1
    mova             [tmpq+stride3q ], m4      ; p0
    mova             [dstq+strideq*0], m5      ; q0
    mova             [dstq+strideq*1], m6      ; q1
    mova             [dstq+strideq*2], m14     ; q2
%elif %1 == 8
    TRANSPOSE8X8W     0, 13, 3, 4, 5, 6, 14, 15, 1

    ; write 8x16
    movu             [dstq+strideq*0-8], xm0
    movu             [dstq+strideq*1-8], xm13
    movu             [dstq+strideq*2-8], xm3
    movu             [dstq+stride3q -8], xm4
    lea              dstq, [dstq+strideq*4]
    movu             [dstq+strideq*0-8], xm5
    movu             [dstq+strideq*1-8], xm6
    movu             [dstq+strideq*2-8], xm14
    movu             [dstq+stride3q -8], xm15
    lea              dstq, [dstq+strideq*4]
    vextracti128     [dstq+strideq*0-8], m0, 1
    vextracti128     [dstq+strideq*1-8], m13, 1
    vextracti128     [dstq+strideq*2-8], m3, 1
    vextracti128     [dstq+stride3q -8], m4, 1
    lea              dstq, [dstq+strideq*4]
    vextracti128     [dstq+strideq*0-8], m5, 1
    vextracti128     [dstq+strideq*1-8], m6, 1
    vextracti128     [dstq+strideq*2-8], m14, 1
    vextracti128     [dstq+stride3q -8], m15, 1
    lea              dstq, [dstq+strideq*4]
%else
    mova             m8, [rsp+6*32]
    mova             m1, [rsp+7*32]
    mova             m2, [rsp+8*32]
    mova             m7, [rsp+9*32]
    TRANSPOSE8X8W     8, 1, 2, 7, 0, 13, 3, 4, 9

    mova             [dstq+strideq*0-16], xm8
    mova             [dstq+strideq*1-16], xm1
    mova             [dstq+strideq*2-16], xm2
    mova             [dstq+stride3q -16], xm7
    lea              tmpq, [dstq+strideq*4]
    mova             [tmpq+strideq*0-16], xm0
    mova             [tmpq+strideq*1-16], xm13
    mova             [tmpq+strideq*2-16], xm3
    mova             [tmpq+stride3q -16], xm4
    lea              tmpq, [tmpq+strideq*4]
    vextracti128     [tmpq+strideq*0-16], m8, 1
    vextracti128     [tmpq+strideq*1-16], m1, 1
    vextracti128     [tmpq+strideq*2-16], m2, 1
    vextracti128     [tmpq+stride3q -16], m7, 1
    lea              tmpq, [tmpq+strideq*4]
    vextracti128     [tmpq+strideq*0-16], m0, 1
    vextracti128     [tmpq+strideq*1-16], m13, 1
    vextracti128     [tmpq+strideq*2-16], m3, 1
    vextracti128     [tmpq+stride3q -16], m4, 1

    mova             m0, [rsp+10*32]
    mova             m1, [rsp+11*32]
    mova             m2, [rsp+12*32]
    mova             m3, [rsp+13*32]
    TRANSPOSE8X8W     5, 6, 14, 15, 0, 1, 2, 3, 4
    mova             [dstq+strideq*0], xm5
    mova             [dstq+strideq*1], xm6
    mova             [dstq+strideq*2], xm14
    mova             [dstq+stride3q ], xm15
    lea              dstq, [dstq+strideq*4]
    mova             [dstq+strideq*0], xm0
    mova             [dstq+strideq*1], xm1
    mova             [dstq+strideq*2], xm2
    mova             [dstq+stride3q ], xm3
    lea              dstq, [dstq+strideq*4]
    vextracti128     [dstq+strideq*0], m5, 1
    vextracti128     [dstq+strideq*1], m6, 1
    vextracti128     [dstq+strideq*2], m14, 1
    vextracti128     [dstq+stride3q ], m15, 1
    lea              dstq, [dstq+strideq*4]
    vextracti128     [dstq+strideq*0], m0, 1
    vextracti128     [dstq+strideq*1], m1, 1
    vextracti128     [dstq+strideq*2], m2, 1
    vextracti128     [dstq+stride3q ], m3, 1
    lea              dstq, [dstq+strideq*4]
%endif
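    ; 6-wide blocks use the narrower flat6 filter instead, e.g.
    ; p1' = (3*p2 + 2*p1 + 2*p0 + q0 + 4) >> 3, again via pmulhrsw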
%elif %1 == 6
    ; flat6 filter
    vpbroadcastd     m7, [pw_4096]
    paddw            m8, m3, m4
    paddw            m8, m13                   ; p2+p1+p0
    paddw            m11, m13, m5
    paddw            m8, m8
    paddw            m8, m11                   ; p2+2*(p2+p1+p0)+q0
    pmulhrsw         m2, m8, m7

    paddw            m8, m5
    paddw            m11, m13, m13
    paddw            m8, m6
    psubw            m8, m11
    pmulhrsw         m10, m8, m7

    paddw            m8, m6
    paddw            m11, m13, m3
    paddw            m8, m14
    psubw            m8, m11
    pmulhrsw         m11, m8, m7

    psubw            m8, m3
    paddw            m14, m14
    psubw            m8, m4
    paddw            m8, m14
    pmulhrsw         m8, m7

    pblendvb         m3, m2, m9
    pblendvb         m4, m10, m9
    pblendvb         m5, m11, m9
    pblendvb         m6, m8, m9

%ifidn %2, v
    mova             [tmpq+strideq*2], m3      ; p1
    mova             [tmpq+stride3q ], m4      ; p0
    mova             [dstq+strideq*0], m5      ; q0
    mova             [dstq+strideq*1], m6      ; q1
%else
    TRANSPOSE_8x4_AND_WRITE_4x16
%endif
%else
%ifidn %2, v
    mova             [tmpq+strideq*0], m3      ; p1
    mova             [tmpq+strideq*1], m4      ; p0
    mova             [tmpq+strideq*2], m5      ; q0
    mova             [tmpq+stride3q ], m6      ; q1
%else
    TRANSPOSE_8x4_AND_WRITE_4x16
%endif
%endif
%endmacro

INIT_YMM avx2
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
                          dst, stride, mask, l, l_stride, lut, \
                          w, stride3, mstride, tmp, mask_bits
    mov              r6d, r7m
    lea              r11, [pw_4]
    shr              r6d, 11                   ; is_12bpc
    lea              r11, [r11+r6*4]
    mov              wd, wm
    shl              l_strideq, 2
    sub              lq, l_strideq
    mov              mstrideq, strideq
    neg              mstrideq
    lea              stride3q, [strideq*3]
    mov              mask_bitsd, 0xf
    mova             m12, [pb_mask]

.loop:
    test             [maskq+8], mask_bitsd     ; vmask[2]
    jz               .no_flat16

    FILTER            16, v
    jmp              .end

.no_flat16:
    test             [maskq+4], mask_bitsd     ; vmask[1]
    jz               .no_flat

    FILTER            8, v
    jmp              .end

.no_flat:
    test             [maskq+0], mask_bitsd     ; vmask[0]
    jz               .end

    call             .v4

.end:
    pslld            m12, 4
    add              lq, 16
    add              dstq, 32
    shl              mask_bitsd, 4
    sub              wd, 4
    jg               .loop
    RET
ALIGN function_align
.v4:
    FILTER            4, v
    ret

INIT_YMM avx2
cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
                          dst, stride, mask, l, l_stride, lut, \
                          h, stride3, l_stride3, tmp, mask_bits
    mov              r6d, r7m
    lea              r11, [pw_4]
    shr              r6d, 11                   ; is_12bpc
    lea              r11, [r11+r6*4]
    mov              hd, hm
    shl              l_strideq, 2
    sub              lq, 4
    lea              stride3q, [strideq*3]
    lea              l_stride3q, [l_strideq*3]
    mov              mask_bitsd, 0xf
    mova             m12, [pb_mask]

.loop:
    test             [maskq+8], mask_bitsd     ; vmask[2]
    jz               .no_flat16

    FILTER            16, h
    jmp              .end

.no_flat16:
    test             [maskq+4], mask_bitsd     ; vmask[1]
    jz               .no_flat

    FILTER            8, h
    jmp              .end

.no_flat:
    test             [maskq+0], mask_bitsd     ; vmask[0]
    jz               .no_filter

    call             .h4
    jmp              .end

.no_filter:
    lea              dstq, [dstq+strideq*8]
    lea              dstq, [dstq+strideq*8]
.end:
    pslld            m12, 4
    lea              lq, [lq+l_strideq*4]
    shl              mask_bitsd, 4
    sub              hd, 4
    jg               .loop
    RET
ALIGN function_align
.h4:
    FILTER            4, h
    ret

INIT_YMM avx2
cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           w, stride3, mstride, tmp, mask_bits
    mov              r6d, r7m
    lea              r11, [pw_4]
    shr              r6d, 11                   ; is_12bpc
    lea              r11, [r11+r6*4]
    mov              wd, wm
    shl              l_strideq, 2
    sub              lq, l_strideq
    mov              mstrideq, strideq
    neg              mstrideq
    lea              stride3q, [strideq*3]
    mov              mask_bitsd, 0xf
    mova             m12, [pb_mask]

.loop:
    test             [maskq+4], mask_bitsd     ; vmask[1]
    jz               .no_flat

    FILTER            6, v
    jmp              .end

.no_flat:
    test             [maskq+0], mask_bitsd     ; vmask[0]
    jz               .end

    call             mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4

.end:
    pslld            m12, 4
    add              lq, 16
    add              dstq, 32
    shl              mask_bitsd, 4
    sub              wd, 4
    jg               .loop
    RET

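; the chroma functions only test vmask[0]/vmask[1] since chroma filtering
; is at most 6-wide, and they reuse the luma .v4/.h4 helpers via mangle()
; instead of duplicating them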
INIT_YMM avx2
cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           h, stride3, l_stride3, tmp, mask_bits
    mov              r6d, r7m
    lea              r11, [pw_4]
    shr              r6d, 11                   ; is_12bpc
    lea              r11, [r11+r6*4]
    mov              hd, hm
    shl              l_strideq, 2
    sub              lq, 4
    lea              stride3q, [strideq*3]
    lea              l_stride3q, [l_strideq*3]
    mov              mask_bitsd, 0xf
    mova             m12, [pb_mask]

.loop:
    test             [maskq+4], mask_bitsd     ; vmask[1]
    jz               .no_flat

    FILTER            6, h
    jmp              .end

.no_flat:
    test             [maskq+0], mask_bitsd     ; vmask[0]
    jz               .no_filter

    call             mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
    jmp              .end

.no_filter:
    lea              dstq, [dstq+strideq*8]
    lea              dstq, [dstq+strideq*8]
.end:
    pslld            m12, 4
    lea              lq, [lq+l_strideq*4]
    shl              mask_bitsd, 4
    sub              hd, 4
    jg               .loop
    RET

%endif ; ARCH_X86_64