; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12

pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080
         dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000

hmulA:  dd 0, 8, 16, 24, 32, 40, 48, 56, 4, 12, 20, 28, 36, 44, 52, 60
hmulB:  dd 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
hmulC:  dd 0, 1, 2, 3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
hmulD:  dd 0, 1, 16, 17, 32, 33, 48, 49
hshuf4: db 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15

shift1: dq 0x0204081020408000
shift3: dq 0x0810204080000000
shift4: dq 0x1020408000000000

pb_1:    times 4 db 1
pb_2:    times 4 db 2
pb_3:    times 4 db 3
pb_4:    times 4 db 4
pb_16:   times 4 db 16
pb_63:   times 4 db 63
pb_64:   times 4 db 64
pb_128:  times 4 db 0x80
pb_2_1:  times 2 db 2, 1
pb_3_1:  times 2 db 3, 1
pb_7_1:  times 2 db 7, 1
pb_m1_0: times 2 db -1, 0
pb_m1_1: times 2 db -1, 1
pb_m1_2: times 2 db -1, 2
pw_2048: times 2 dw 2048
pw_4096: times 2 dw 4096

SECTION .text

%macro ABSSUB 4 ; dst, a, b, tmp
    psubusb       %1, %2, %3
    psubusb       %4, %3, %2
    por           %1, %4
%endmacro

%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
    punpcklbw     m%5, m%1, m%2
    punpckhbw     m%1, m%2
    punpcklbw     m%2, m%3, m%4
    punpckhbw     m%3, m%4
    punpcklwd     m%4, m%5, m%2
    punpckhwd     m%5, m%2
    punpcklwd     m%2, m%1, m%3
    punpckhwd     m%1, m%3
    kmovw         k1, k6
    lea           t0, [dstq+strideq*4]
    vpscatterdd   [dstq+m19-2]{k1}, m%4
    kmovw         k1, k6
    lea           t1, [dstq+strideq*8]
    vpscatterdd   [t0 +m19-2]{k1}, m%5
    kmovw         k1, k6
    lea           t2, [t0 +strideq*8]
    vpscatterdd   [t1 +m19-2]{k1}, m%2
    kmovw         k1, k6
    vpscatterdd   [t2 +m19-2]{k1}, m%1
%endmacro

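; Transposes a 16x16 byte block held in m24,m26,m2-m11,m25,m13,m14,m22 in
; four interleave passes (bytes -> words -> dwords -> qwords). Since 17
; values are live during a pass, one of them is parked in m16 or in the
; stack slot %3; %1/%2 select whether row 15 is loaded from / row 0 is
; stored to that slot, and the trailing SWAPs renumber the outputs for the
; caller.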
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
    SWAP          m16, m22
%endif
    punpcklbw     m22, m24, m26
    punpckhbw     m24, m26
    punpcklbw     m26, m2, m3
    punpckhbw     m2, m3
    punpcklbw     m3, m4, m5
    punpckhbw     m4, m5
    punpcklbw     m5, m6, m7
    punpckhbw     m6, m7
    punpcklbw     m7, m8, m9
    punpckhbw     m8, m9
    punpcklbw     m9, m10, m11
    punpckhbw     m10, m11
    punpcklbw     m11, m25, m13
    punpckhbw     m25, m13
%if %1 == 0
    SWAP          m13, m16
%else
    mova          m13, %3
%endif
    SWAP          m16, m25
    punpcklbw     m25, m14, m13
    punpckhbw     m13, m14, m13
    ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
    punpcklwd     m14, m22, m26
    punpckhwd     m22, m26
    punpcklwd     m26, m24, m2
    punpckhwd     m24, m2
    punpcklwd     m2, m3, m5
    punpckhwd     m3, m5
    punpcklwd     m5, m4, m6
    punpckhwd     m4, m6
    punpcklwd     m6, m7, m9
    punpckhwd     m7, m9
    punpcklwd     m9, m8, m10
    punpckhwd     m8, m10
    punpcklwd     m10, m11, m25
    punpckhwd     m11, m25
    SWAP          m25, m16, m11
    punpcklwd     m11, m25, m13
    punpckhwd     m25, m13
    ; interleaved in m14,22,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
    punpckldq     m13, m14, m2
    punpckhdq     m14, m2
    punpckldq     m2, m22, m3
    punpckhdq     m22, m3
    punpckldq     m3, m26, m5
    punpckhdq     m26, m5
    punpckldq     m5, m24, m4
    punpckhdq     m24, m4
    punpckldq     m4, m6, m10
    punpckhdq     m6, m10
    punpckldq     m10, m9, m11
    punpckhdq     m9, m11
    punpckldq     m11, m8, m25
    punpckhdq     m8, m25
    SWAP          m25, m16, m8
    punpckldq     m8, m7, m25
    punpckhdq     m7, m25
    ; interleaved in m13,14,2,22,3,26,5,24,4,6,8,7,10,9,11,rsp%3
    punpcklqdq    m25, m13, m4
    punpckhqdq    m13, m4
    punpcklqdq    m4, m14, m6
    punpckhqdq    m14, m6
    punpcklqdq    m6, m2, m8
    punpckhqdq    m2, m8
    punpcklqdq    m8, m22, m7
    punpckhqdq    m22, m7
    punpcklqdq    m7, m3, m10
    punpckhqdq    m3, m10
    punpcklqdq    m10, m26, m9
    punpckhqdq    m26, m9
    punpcklqdq    m9, m5, m11
    punpckhqdq    m5, m11
    SWAP          m11, m16
%if %2 == 0
    SWAP          m16, m25
%else
    mova          %3, m25
%endif
    punpcklqdq    m25, m24, m11
    punpckhqdq    m24, m11
%if %2 == 0
    SWAP          m11, m16
%endif
    ; interleaved m11,13,4,14,6,2,8,22,7,3,10,26,9,5,25,24
    SWAP 24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
    SWAP 3, 14, 25, 9
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
    %define is_h 0
%if %1 == 4
    lea           t0, [dstq+mstrideq*2]
    mova          m3, [t0 +strideq*0]  ; p1
    mova          m4, [t0 +strideq*1]  ; p0
    mova          m5, [t0 +strideq*2]  ; q0
    mova          m6, [t0 +stride3q ]  ; q1
%else
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
%if %1 == 16
    lea           t0, [dstq+mstrideq*8]
    mova          m16, [t0 +strideq*1]
    mova          m17, [t0 +strideq*2]
    mova          m18, [t0 +stride3q ]
%endif
    lea           t0, [dstq+mstrideq*4]
%if %1 != 6
    mova          m25, [t0 +strideq*0]
%endif
    mova          m13, [t0 +strideq*1]
    mova          m3, [t0 +strideq*2]
    mova          m4, [t0 +stride3q ]
    mova          m5, [dstq+strideq*0]
    mova          m6, [dstq+strideq*1]
    mova          m14, [dstq+strideq*2]
%if %1 != 6
    mova          m22, [dstq+stride3q ]
%endif
%if %1 == 16
    lea           t0, [dstq+strideq*4]
    mova          m29, [t0 +strideq*0]
    mova          m30, [t0 +strideq*1]
    mova          m31, [t0 +strideq*2]
%endif
%endif
%else ; h
    %define is_h 1
    ; load lines
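    ; wd=4: each vpgatherdd fetches one dword (p1..q1 around the edge) per
    ; row, 16 rows at a time, using the precomputed row offsets in m19; the
    ; pshufb + punpck sequence then transposes the gathered data so that
    ; m3-m6 each hold one column (p1/p0/q0/q1) for all 64 rows. k6 is
    ; reloaded into k1 before every gather because gathers clear their mask.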
%if %1 == 4
    vbroadcasti32x4 m0, [hshuf4]
    kmovw         k1, k6
    lea           t0, [dstq+strideq*4]
    vpgatherdd    m3{k1}, [dstq+m19-2]
    kmovw         k1, k6
    lea           t1, [dstq+strideq*8]
    vpgatherdd    m4{k1}, [t0 +m19-2]
    kmovw         k1, k6
    lea           t2, [t0 +strideq*8]
    vpgatherdd    m5{k1}, [t1 +m19-2]
    kmovw         k1, k6
    vpgatherdd    m6{k1}, [t2 +m19-2]
    pshufb        m3, m0
    pshufb        m4, m0
    pshufb        m5, m0
    pshufb        m6, m0
    punpckldq     m7, m3, m4
    punpckhdq     m3, m4
    punpckldq     m4, m5, m6
    punpckhdq     m5, m6
    punpcklqdq    m6, m7, m4
    punpckhqdq    m7, m4
    punpcklqdq    m4, m3, m5
    punpckhqdq    m3, m5
    SWAP 3, 6
    SWAP 5, 4, 7
    ; 6,7,4,3 -> 3,4,5,6
%elif %1 == 6 || %1 == 8
    kmovb         k1, k7
    lea           t0, [dstq+strideq*1]
    vpgatherdq    m3{k1}, [dstq+ym21-%1/2]
    kmovb         k1, k7
    lea           t1, [dstq+strideq*2]
    vpgatherdq    m4{k1}, [t0 +ym21-%1/2]
    kmovb         k1, k7
    lea           t2, [dstq+stride3q ]
    vpgatherdq    m5{k1}, [t1 +ym21-%1/2]
    kmovb         k1, k7
    vextracti32x8 ym0, m21, 1
    vpgatherdq    m6{k1}, [t2 +ym21-%1/2]
    kmovb         k1, k7
    vpgatherdq    m12{k1}, [dstq+ym0 -%1/2]
    kmovb         k1, k7
    vpgatherdq    m13{k1}, [t0 +ym0 -%1/2]
    kmovb         k1, k7
    vpgatherdq    m14{k1}, [t1 +ym0 -%1/2]
    kmovb         k1, k7
    vpgatherdq    m15{k1}, [t2 +ym0 -%1/2]
    ; transpose 8x16
    ; xm3:  A-H0,A-H8
    ; xm4:  A-H1,A-H9
    ; xm5:  A-H2,A-H10
    ; xm6:  A-H3,A-H11
    ; xm12: A-H4,A-H12
    ; xm13: A-H5,A-H13
    ; xm14: A-H6,A-H14
    ; xm15: A-H7,A-H15
    punpcklbw     m7, m3, m4
    punpckhbw     m3, m4
    punpcklbw     m4, m5, m6
    punpckhbw     m5, m6
    punpcklbw     m6, m12, m13
    punpckhbw     m12, m13
    punpcklbw     m13, m14, m15
    punpckhbw     m14, m15
    ; xm7:  A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
    ; xm3:  A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
    ; xm4:  A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
    ; xm5:  A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
    ; xm6:  A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
    punpcklwd     m15, m7, m4
    punpckhwd     m7, m4
    punpcklwd     m4, m3, m5
    punpckhwd     m3, m5
    punpcklwd     m5, m6, m13
    punpckhwd     m6, m13
    punpcklwd     m13, m12, m14
    punpckhwd     m12, m14
    ; xm15: A0-3,B0-3,C0-3,D0-3
    ; xm7:  E0-3,F0-3,G0-3,H0-3
    ; xm4:  A8-11,B8-11,C8-11,D8-11
    ; xm3:  E8-11,F8-11,G8-11,H8-11
    ; xm5:  A4-7,B4-7,C4-7,D4-7
    ; xm6:  E4-7,F4-7,G4-7,H4-7
    ; xm13: A12-15,B12-15,C12-15,D12-15
    ; xm12: E12-15,F12-15,G12-15,H12-15
    punpckldq     m14, m15, m5
    punpckhdq     m15, m5
    punpckldq     m5, m7, m6
 %if %1 != 6
    punpckhdq     m7, m6
 %endif
    punpckldq     m6, m4, m13
    punpckhdq     m4, m13
    punpckldq     m13, m3, m12
 %if %1 != 6
    punpckhdq     m12, m3, m12
 %endif
    ; xm14: A0-7,B0-7
    ; xm15: C0-7,D0-7
    ; xm5:  E0-7,F0-7
    ; xm7:  G0-7,H0-7
    ; xm6:  A8-15,B8-15
    ; xm4:  C8-15,D8-15
    ; xm13: E8-15,F8-15
    ; xm12: G8-15,H8-15
    punpcklqdq    m3, m14, m6
    punpckhqdq    m14, m6
    punpckhqdq    m6, m15, m4
    punpcklqdq    m15, m4
    punpcklqdq    m4, m5, m13
    punpckhqdq    m13, m5, m13
 %if %1 == 8
    punpcklqdq    m5, m7, m12
    punpckhqdq    m25, m7, m12
    ; xm3:  A0-15
    ; xm14: B0-15
    ; xm15: C0-15
    ; xm6:  D0-15
    ; xm4:  E0-15
    ; xm13: F0-15
    ; xm5:  G0-15
    ; xm25: H0-15
    SWAP 25, 3, 15
    SWAP 13, 14, 5, 4, 6
    SWAP 15, 22
    ; 3,14,15,6,4,13,5,25 -> 25,13,3,4,5,6,14,22
 %else
    SWAP 13, 3, 14
    SWAP 6, 4, 15, 5
    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
 %endif
%else ; 16, h
    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
    ; remainder at the end for the second transpose
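    ; 64 rows are loaded as four vertically stacked 16x16 tiles: rows 0-15
    ; fill the xmm halves, rows 16-63 are vinserti32x4'd into 128-bit lanes
    ; 1-3. Each zmm then holds the same row index from all four tiles, and
    ; since the punpck* interleaves operate per 128-bit lane, one
    ; TRANSPOSE_16X16B transposes all four tiles at once.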
    movu          xm24, [dstq+strideq*0-8]
    movu          xm26, [dstq+strideq*1-8]
    movu          xm2, [dstq+strideq*2-8]
    movu          xm3, [dstq+stride3q -8]
    lea           t0, [dstq+strideq*4]
    movu          xm4, [t0 +strideq*0-8]
    movu          xm5, [t0 +strideq*1-8]
    movu          xm6, [t0 +strideq*2-8]
    movu          xm7, [t0 +stride3q -8]
    lea           t0, [t0 +strideq*4]
    movu          xm8, [t0 +strideq*0-8]
    movu          xm9, [t0 +strideq*1-8]
    movu          xm10, [t0 +strideq*2-8]
    movu          xm11, [t0 +stride3q -8]
    lea           t0, [t0 +strideq*4]
    movu          xm25, [t0 +strideq*0-8]
    movu          xm13, [t0 +strideq*1-8]
    movu          xm14, [t0 +strideq*2-8]
    movu          xm22, [t0 +stride3q -8]
    lea           t0, [t0 +strideq*4]
    vinserti32x4  ym24, [t0 +strideq*0-8], 1
    vinserti32x4  ym26, [t0 +strideq*1-8], 1
    vinserti32x4  ym2, [t0 +strideq*2-8], 1
    vinserti32x4  ym3, [t0 +stride3q -8], 1
    lea           t0, [t0 +strideq*4]
    vinserti32x4  ym4, [t0 +strideq*0-8], 1
    vinserti32x4  ym5, [t0 +strideq*1-8], 1
    vinserti32x4  ym6, [t0 +strideq*2-8], 1
    vinserti32x4  ym7, [t0 +stride3q -8], 1
    lea           t0, [t0 +strideq*4]
    vinserti32x4  ym8, [t0 +strideq*0-8], 1
    vinserti32x4  ym9, [t0 +strideq*1-8], 1
    vinserti32x4  ym10, [t0 +strideq*2-8], 1
    vinserti32x4  ym11, [t0 +stride3q -8], 1
    lea           t0, [t0 +strideq*4]
    vinserti32x4  ym25, [t0 +strideq*0-8], 1
    vinserti32x4  ym13, [t0 +strideq*1-8], 1
    vinserti32x4  ym14, [t0 +strideq*2-8], 1
    vinserti32x4  ym22, [t0 +stride3q -8], 1
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m24, [t0 +strideq*0-8], 2
    vinserti32x4  m26, [t0 +strideq*1-8], 2
    vinserti32x4  m2, [t0 +strideq*2-8], 2
    vinserti32x4  m3, [t0 +stride3q -8], 2
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m4, [t0 +strideq*0-8], 2
    vinserti32x4  m5, [t0 +strideq*1-8], 2
    vinserti32x4  m6, [t0 +strideq*2-8], 2
    vinserti32x4  m7, [t0 +stride3q -8], 2
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m8, [t0 +strideq*0-8], 2
    vinserti32x4  m9, [t0 +strideq*1-8], 2
    vinserti32x4  m10, [t0 +strideq*2-8], 2
    vinserti32x4  m11, [t0 +stride3q -8], 2
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m25, [t0 +strideq*0-8], 2
    vinserti32x4  m13, [t0 +strideq*1-8], 2
    vinserti32x4  m14, [t0 +strideq*2-8], 2
    vinserti32x4  m22, [t0 +stride3q -8], 2
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m24, [t0 +strideq*0-8], 3
    vinserti32x4  m26, [t0 +strideq*1-8], 3
    vinserti32x4  m2, [t0 +strideq*2-8], 3
    vinserti32x4  m3, [t0 +stride3q -8], 3
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m4, [t0 +strideq*0-8], 3
    vinserti32x4  m5, [t0 +strideq*1-8], 3
    vinserti32x4  m6, [t0 +strideq*2-8], 3
    vinserti32x4  m7, [t0 +stride3q -8], 3
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m8, [t0 +strideq*0-8], 3
    vinserti32x4  m9, [t0 +strideq*1-8], 3
    vinserti32x4  m10, [t0 +strideq*2-8], 3
    vinserti32x4  m11, [t0 +stride3q -8], 3
    lea           t0, [t0 +strideq*4]
    vinserti32x4  m25, [t0 +strideq*0-8], 3
    vinserti32x4  m13, [t0 +strideq*1-8], 3
    vinserti32x4  m14, [t0 +strideq*2-8], 3
    vinserti32x4  m22, [t0 +stride3q -8], 3
    ;
    TRANSPOSE_16X16B 0, 1, [rsp+0*64]
    SWAP          m16, m26
    SWAP          m17, m2
    SWAP          m18, m3
    SWAP          m29, m25
    SWAP          m30, m13
    SWAP          m31, m14
    mova          [rsp+4*64], m22
    ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
    SWAP 25, 4, 7
    SWAP 13, 5, 8
    SWAP 3, 6, 9
    SWAP 10, 14
    SWAP 11, 22
%endif
%endif

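    ; The filter thresholds are derived from the per-edge filter level L
    ; (with the previous block's level as fallback when L == 0):
    ; I = clamp(L >> sharp0, 1, sharp1) with the two sharpness values taken
    ; from lutq+128/lutq+136, H = L >> 4, and E = (L + 2) * 2 + I. The
    ; L >> 4 is done with gf2p8affineqb (a bit-matrix multiply per byte)
    ; because x86 has no per-byte shift instruction.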
    ; load L/E/I/H
    vpbroadcastd  m15, [pb_1]
%ifidn %2, v
    movu          m1, [lq]
    movu          m0, [lq+l_strideq]
%else
    kmovw         k1, k6
    vpgatherdd    m0{k1}, [lq+m20+4]
    kmovw         k1, k6
    vpgatherdd    m1{k1}, [lq+m20+0]
%endif
    pxor          m2, m2
    pcmpeqb       k1, m0, m2
    vmovdqu8      m0{k1}, m1           ; l[x][] ? l[x][] : l[x-stride][]
    pshufb        m0, pbshuf           ; l[x][0]
    vpcmpub       k3, m0, m2, 4        ; neq ; L
    psrlq         m2, m0, [lutq+128]
    pand          m2, [pb_63]{bcstd}
    vpbroadcastb  m1, [lutq+136]
    pminub        m2, m1
    pmaxub        m2, m15              ; I
    gf2p8affineqb m1, m0, [shift4]{bcstq}, 0 ; H
    paddd         m0, [pb_2]{bcstd}
    paddb         m0, m0
    paddb         m0, m2               ; E

    ABSSUB        m8, m3, m4, m9       ; abs(p1-p0)
    ABSSUB        m9, m5, m6, m10      ; abs(q1-q0)
    pmaxub        m8, m9
    vpcmpub       k1, m8, m1, 6        ; gt ; hev
%if %1 != 4
 %if %1 == 6
    ABSSUB        m9, m13, m4, m10     ; abs(p2-p0)
    pmaxub        m9, m8
 %else
    ABSSUB        m9, m25, m4, m10     ; abs(p3-p0)
    pmaxub        m9, m8
    ABSSUB        m10, m13, m4, m11    ; abs(p2-p0)
    pmaxub        m9, m10
 %endif
    ABSSUB        m10, m5, m14, m11    ; abs(q2-q0)
    pmaxub        m9, m10
 %if %1 != 6
    ABSSUB        m10, m5, m22, m11    ; abs(q3-q0)
    pmaxub        m9, m10
 %endif
    vpcmpub       k2{k3}, m9, m15, 2   ; le ; flat8in
 %if %1 == 6
    ABSSUB        m10, m13, m3, m1     ; abs(p2-p1)
 %else
    ABSSUB        m10, m25, m13, m11   ; abs(p3-p2)
    ABSSUB        m11, m13, m3, m1     ; abs(p2-p1)
    pmaxub        m10, m11
    ABSSUB        m11, m14, m22, m1    ; abs(q3-q2)
    pmaxub        m10, m11
 %endif
    ABSSUB        m11, m14, m6, m1     ; abs(q2-q1)
    pmaxub        m10, m11
 %if %1 == 16
    vpbroadcastd  m11, [maskq+8]
    por           m11, [maskq+4]{bcstd}
 %else
    vpbroadcastd  m11, [maskq+4]
 %endif
    vptestmd      k4, m11, pbmask
    vmovdqa32     m10{k4}{z}, m10      ; only apply fm-wide to wd>4 blocks
    pmaxub        m8, m10
%endif
    vpcmpub       k3{k3}, m8, m2, 2    ; le
    ABSSUB        m10, m3, m6, m11     ; abs(p1-q1)
    ABSSUB        m11, m4, m5, m2      ; abs(p0-q0)
    paddusb       m11, m11
    gf2p8affineqb m10, m10, [shift1]{bcstq}, 0
    paddusb       m10, m11             ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    vpcmpub       k3{k3}, m10, m0, 2   ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E

%if %1 == 16
    ABSSUB        m1, m16, m4, m2
    ABSSUB        m2, m17, m4, m10
    pmaxub        m1, m2
    ABSSUB        m2, m18, m4, m10
    pmaxub        m1, m2
    ABSSUB        m2, m29, m5, m10
    pmaxub        m1, m2
    ABSSUB        m2, m30, m5, m10
    pmaxub        m1, m2
    ABSSUB        m2, m31, m5, m10
    pmaxub        m1, m2
    kandq         k2, k2, k3
    vpcmpub       k4{k2}, m1, m15, 2   ; flat8in & flat8out
    vpbroadcastd  m2, [maskq+8]
    vptestmd      k5, m2, pbmask
    vpmovm2d      m7, k5
    vptestmb      k4{k4}, m7, m7       ; flat16 & fm
    por           m10, m2, [maskq+4]{bcstd}
    vptestmd      k5, m10, pbmask
    vpmovm2d      m7, k5
    vptestmb      k2{k2}, m7, m7       ; flat8in
    por           m2, m10, [maskq+0]{bcstd}
    vptestmd      k5, m2, pbmask
    vpmovm2d      m7, k5
    vptestmb      k3{k3}, m7, m7
    kandnq        k3, k2, k3           ; fm & !flat8 & !flat16
    kandnq        k2, k4, k2           ; flat8 & !flat16
%elif %1 != 4
    vpbroadcastd  m0, [maskq+4]
    vptestmd      k4, m0, pbmask
    vpmovm2d      m7, k4
    vptestmb      k2{k2}, m7, m7
    kandq         k2, k2, k3           ; flat8 & fm
    por           m0, [maskq+0]{bcstd}
    vptestmd      k4, m0, pbmask
    vpmovm2d      m7, k4
    vptestmb      k3{k3}, m7, m7
    kandnq        k3, k2, k3           ; fm & !flat8
%else
 %ifidn %2, v
    vptestmd      k4, pbmask, [maskq+0]{bcstd}
 %else
    vpbroadcastd  m0, [maskq+0]
    vptestmd      k4, m0, pbmask
 %endif
    vpmovm2d      m7, k4
    vptestmb      k3{k3}, m7, m7       ; fm
%endif

    ; short filter
%if %1 >= 8
    SWAP          m23, m15
%endif
    vpbroadcastd  m15, [pb_3]
    vpbroadcastd  m0, [pb_4]
    vpbroadcastd  m12, [pb_16]
    vpbroadcastd  m1, [pb_64]
    pxor          m3, pb128
    pxor          m6, pb128
    psubsb        m10{k1}{z}, m3, m6   ; f=iclip_diff(p1-q1)&hev
    pxor          m4, pb128
    pxor          m5, pb128
    psubsb        m11, m5, m4
    paddsb        m10, m11
    paddsb        m10, m11
    paddsb        m10{k3}{z}, m10, m11 ; f=iclip_diff(3*(q0-p0)+f)&fm
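    ; f1/f2: the signed ">> 3" of f+4/f+3 is done with gf2p8affineqb using
    ; the shift3 bit-matrix (an unsigned per-byte shift right by 3) and an
    ; imm8 of 16, which xors in a bias at the shifted sign-bit position;
    ; the psubb of pb_16 below removes it again, so the pair emulates an
    ; arithmetic shift right by 3 on each signed byte.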
    paddsb        m8, m10, m15
    paddsb        m10, m0
    gf2p8affineqb m8, m8, [shift3]{bcstq}, 16
    gf2p8affineqb m10, m10, [shift3]{bcstq}, 16
    psubb         m8, m12              ; f2
    psubb         m10, m12             ; f1
    paddsb        m4, m8
    psubsb        m5, m10
    pxor          m4, pb128
    pxor          m5, pb128
    ;
    pxor          m10, pb128
    pxor          m8, m8
    pavgb         m8, m10              ; f=(f1+1)>>1
    psubb         m8, m1
    knotq         k1, k1
    paddsb        m3{k1}, m3, m8
    psubsb        m6{k1}, m6, m8
    pxor          m3, pb128
    pxor          m6, pb128

%if %1 == 16
    ; flat16 filter
%ifidn %2, v
    lea           t0, [dstq+mstrideq*8]
%endif
    SWAP          m24, m16, m14
    SWAP          m2, m17, m22
    SWAP          m7, m18

    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
    ; write -6
    vpbroadcastd  m1, [pb_7_1]
    vpbroadcastd  m12, [pb_2]
    punpcklbw     m14, m24, m25
    punpckhbw     m22, m24, m25
    pmaddubsw     m10, m14, m1
    pmaddubsw     m11, m22, m1         ; p6*7+p3
    punpcklbw     m8, m2, m7
    punpckhbw     m9, m2, m7
    pmaddubsw     m8, m12
    pmaddubsw     m9, m12
    paddw         m10, m8
    paddw         m11, m9              ; p6*7+p5*2+p4*2+p3
%ifidn %2, h
    vpbroadcastd  m27, [pw_2048]
    vpbroadcastd  m1, [pb_m1_1]
 %define pw2048 m27
 %define pbm1_1 m1
%endif
    punpcklbw     m8, m13, m3
    punpckhbw     m9, m13, m3
    pmaddubsw     m8, m23
    pmaddubsw     m9, m23
    paddw         m10, m8
    paddw         m11, m9              ; p6*7+p5*2+p4*2+p3+p2+p1
    punpcklbw     m8, m4, m5
    punpckhbw     m9, m4, m5
    pmaddubsw     m8, m23
    pmaddubsw     m9, m23
    paddw         m10, m8
    paddw         m11, m9              ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    pmulhrsw      m8, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m8, m9
%ifidn %2, v
    vmovdqu8      [t0+strideq*2]{k4}, m8 ; p5
%else
    vpblendmb     m8{k4}, m2, m8
    mova          [rsp+1*64], m8
%endif

    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
    ; write -5
    pmaddubsw     m14, pbm1_1
    pmaddubsw     m22, pbm1_1
    paddw         m10, m14
    paddw         m11, m22             ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
    punpcklbw     m8, m24, m6
    punpckhbw     m9, m24, m6
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
    SWAP          m18, m8
    SWAP          m23, m9
    pmulhrsw      m8, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m8, m9
%ifidn %2, v
    vmovdqu8      [t0+stride3q]{k4}, m8 ; p4
%else
    vpblendmb     m8{k4}, m7, m8
    mova          [rsp+2*64], m8
%endif

    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
    ; write -4
    SWAP          m14, m16
    punpcklbw     m8, m24, m13
    punpckhbw     m9, m24, m13
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
    punpcklbw     m8, m2, m14
    punpckhbw     m2, m14
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m2, pbm1_1
    paddw         m10, m8
    paddw         m11, m2              ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
    SWAP          m16, m8
    pmulhrsw      m8, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m8, m9
%ifidn %2, v
    vmovdqu8      [t0+strideq*4]{k4}, m8 ; p3
%else
    vpblendmb     m8{k4}, m25, m8
    mova          [rsp+3*64], m8
%endif

    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
    ; write -3
    SWAP          m22, m17
    punpcklbw     m8, m24, m3
    punpckhbw     m9, m24, m3
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
    punpcklbw     m8, m7, m22
    punpckhbw     m7, m22
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m7, pbm1_1
    paddw         m10, m8
    paddw         m11, m7              ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
    SWAP          m17, m8
    pmulhrsw      m8, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m8, m9
    vpblendmb     m15{k4}, m13, m8     ; don't clobber p2/m13 since we need it in F

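    ; Each lettered step (A-L) keeps the flat16 neighbourhood sum, with
    ; weights totalling 16, as words in m10/m11 (low/high halves): one
    ; pmaddubsw against pbm1_1 retires the oldest tap and admits the next
    ; (-old + new), and pmulhrsw with pw_2048 rounds the sum back to bytes,
    ; computing (sum + 8) >> 4 for the row being written.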
    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
    ; write -2
%ifidn %2, v
    lea           t0, [dstq+strideq*4]
%endif
    punpcklbw     m8, m24, m4
    punpckhbw     m9, m24, m4
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
    punpcklbw     m8, m25, m29
    punpckhbw     m9, m25, m29
    SWAP          m26, m29
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
    SWAP          m29, m8
    SWAP          m0, m9
    pmulhrsw      m8, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m8, m9
    vpblendmb     m12{k4}, m3, m8      ; don't clobber p1/m3 since we need it in G

    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
    ; write -1
%ifidn %2, h
    SWAP          m28, m24
    punpcklbw     m8, m28, m5
    punpckhbw     m24, m28, m5
%else
    punpcklbw     m8, m24, m5
    punpckhbw     m24, m5
%endif
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m24, pbm1_1
    paddw         m10, m8
    paddw         m11, m24             ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
    punpcklbw     m24, m13, m30
    punpckhbw     m9, m13, m30
%ifidn %2, h
    SWAP          m27, m30
%endif
    SWAP          m13, m15
    pmaddubsw     m24, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m24
    paddw         m11, m9              ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
    SWAP          m30, m24
    SWAP          m15, m9
%ifidn %2, h
    SWAP          m9, m24
 %define pw2048 m9
%endif
    pmulhrsw      m24, m10, pw2048
    pmulhrsw      m8, m11, pw2048
    paddw         m10, m18             ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
    paddw         m11, m23
    packuswb      m24, m8
    punpcklbw     m8, m3, m31
    pmaddubsw     m8, pbm1_1
    paddw         m10, m8              ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
    SWAP          m18, m8
    pmulhrsw      m8, m10, pw2048
    paddw         m10, m16             ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
%ifidn %2, h
    SWAP          m16, m9
 %define pw2048 m16
%endif
    punpckhbw     m9, m3, m31
    SWAP          m3, m12
    pmaddubsw     m9, pbm1_1
    paddw         m11, m9              ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
    SWAP          m23, m9
    pmulhrsw      m9, m11, pw2048
    paddw         m11, m2              ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
%ifidn %2, h
    SWAP          m2, m1
 %define pbm1_1 m2
%endif
    vpblendmb     m1{k4}, m4, m24      ; don't clobber p0/m4 since we need it in H

    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
    ; write +0
    SWAP          m24, m31             ; q6
    packuswb      m8, m9
%ifidn %2, h
    SWAP          m31, m2
 %define pbm1_1 m31
%endif
    vpblendmb     m12{k4}, m5, m8      ; don't clobber q0/m5 since we need it in I

    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
    ; write +1
    punpcklbw     m8, m4, m24
    punpckhbw     m2, m4, m24
    SWAP          m4, m1
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m2, pbm1_1
    paddw         m10, m8
    paddw         m11, m2              ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
    pmulhrsw      m2, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m2, m9
    vpblendmb     m2{k4}, m6, m2       ; don't clobber q1/m6 since we need it in K

    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
    ; write +2
    paddw         m10, m17             ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
    paddw         m11, m7
    punpcklbw     m8, m5, m24
    punpckhbw     m9, m5, m24
    SWAP          m5, m12
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
    pmulhrsw      m7, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m7, m9
    vpblendmb     m7{k4}, m14, m7      ; don't clobber q2/m14 since we need it in K

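    ; In the horizontal case the flat16 rows cannot be stored directly, so
    ; finished rows are blended under k4 into spare registers or rsp slots
    ; for the final transpose, and the pw2048/pbm1_1 constants migrate
    ; between registers (hence the repeated %define redirections) as their
    ; old homes get recycled.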
    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
    ; write +3
    paddw         m10, m29             ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
    paddw         m11, m0
    punpcklbw     m8, m6, m24
    punpckhbw     m9, m6, m24
    SWAP 2, 6
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
    pmulhrsw      m8, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m8, m9
%ifidn %2, v
    vmovdqu8      [t0+mstrideq]{k4}, m8
%else
    SWAP          m29, m16
 %define pw2048 m29
    vpblendmb     m16{k4}, m22, m8
%endif

    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
    ; write +4
    paddw         m10, m30             ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
    paddw         m11, m15
%ifidn %2, h
    SWAP          m15, m8
%endif
    punpcklbw     m8, m14, m24
    punpckhbw     m9, m14, m24
    SWAP 14, 7
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
    pmulhrsw      m8, m10, pw2048
    pmulhrsw      m9, m11, pw2048
    packuswb      m8, m9
%ifidn %2, v
    vmovdqu8      [t0+strideq*0]{k4}, m8 ; q4
%else
    vpblendmb     m17{k4}, m26, m8
%endif

    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
    ; write +5
    paddw         m10, m18             ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
    paddw         m11, m23
    punpcklbw     m8, m22, m24
    punpckhbw     m9, m22, m24
    SWAP          m30, m24
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m9, pbm1_1
    paddw         m10, m8
    paddw         m11, m9              ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
    pmulhrsw      m10, pw2048
    pmulhrsw      m11, pw2048
    packuswb      m10, m11
%ifidn %2, v
    vmovdqu8      [t0+strideq*1]{k4}, m10 ; q5
%else
    vmovdqu8      m27{k4}, m10
%endif

%ifidn %2, v
    lea           t0, [dstq+mstrideq*4]
%endif
%endif

%if %1 >= 8
    ; flat8 filter
    vpbroadcastd  m9, [pb_3_1]
    vpbroadcastd  m10, [pb_2_1]
%if %1 == 16
    vpbroadcastd  m23, [pb_1]
    vpbroadcastd  m0, [pb_4]
%elifidn %2, h
    vpbroadcastd  m31, [pb_m1_1]
 %define pbm1_1 m31
%endif
    punpcklbw     m24, m25, m3
    punpckhbw     m26, m25, m3
    pmaddubsw     m2, m24, m9
    pmaddubsw     m7, m26, m9          ; 3 * p3 + p1
    punpcklbw     m8, m13, m4
    punpckhbw     m11, m13, m4
    pmaddubsw     m8, m10
    pmaddubsw     m11, m10
    paddw         m2, m8
    paddw         m7, m11              ; 3 * p3 + 2 * p2 + p1 + p0
    punpcklbw     m8, m5, m0
    punpckhbw     m11, m5, m0
    pmaddubsw     m8, m23
    pmaddubsw     m11, m23
    paddw         m2, m8
    paddw         m7, m11              ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
    psrlw         m8, m2, 3
    psrlw         m11, m7, 3
    packuswb      m8, m11
%if is_h || %1 == 16
    vpblendmb     m10{k2}, m13, m8     ; p2
%endif
%ifidn %2, v
 %if %1 == 8
    vmovdqu8      [t0+strideq*1]{k2}, m8
 %else
    mova          [t0+strideq*1], m10
 %endif
%endif

    pmaddubsw     m8, m24, pbm1_1
    pmaddubsw     m11, m26, pbm1_1
    paddw         m2, m8
    paddw         m7, m11
    punpcklbw     m8, m13, m6
    punpckhbw     m11, m13, m6
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m11, pbm1_1
    paddw         m2, m8
    paddw         m7, m11              ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
    psrlw         m8, m2, 3
    psrlw         m11, m7, 3
    packuswb      m8, m11
    vpblendmb     m8{k2}, m3, m8       ; p1
%ifidn %2, v
    mova          [t0+strideq*2], m8
%else
    SWAP          m18, m8
%endif

    pmaddubsw     m24, m23
    pmaddubsw     m26, m23
    psubw         m2, m24
    psubw         m7, m26
    punpcklbw     m8, m4, m14
    punpckhbw     m11, m4, m14
    pmaddubsw     m8, m23
    pmaddubsw     m11, m23
    paddw         m2, m8
    paddw         m7, m11              ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
    psrlw         m8, m2, 3
    psrlw         m11, m7, 3
    packuswb      m8, m11
    vpblendmb     m8{k2}, m4, m8       ; p0
%ifidn %2, v
    mova          [t0+stride3q], m8
%else
    SWAP          m29, m8
%endif

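    ; The q-side outputs below keep reusing the same word sums in m2/m7:
    ; the +4 rounding bias has been part of the sums since the p2 row
    ; (folded in via the (q0, pb_4) pmaddubsw pair above), so each output
    ; is an 8-weight sum shifted right by 3.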
    punpcklbw     m24, m5, m22
    punpckhbw     m26, m5, m22
    pmaddubsw     m8, m24, m23
    pmaddubsw     m11, m26, m23
    paddw         m2, m8
    paddw         m7, m11
    punpcklbw     m8, m4, m25
    punpckhbw     m11, m4, m25
    pmaddubsw     m8, m23
    pmaddubsw     m11, m23
    psubw         m2, m8
    psubw         m7, m11              ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
    psrlw         m8, m2, 3
    psrlw         m11, m7, 3
    packuswb      m8, m11
    vpblendmb     m11{k2}, m5, m8      ; q0
%ifidn %2, v
    mova          [dstq+strideq*0], m11
%endif

    pmaddubsw     m24, pbm1_1
    pmaddubsw     m26, pbm1_1
    paddw         m2, m24
    paddw         m7, m26
    punpcklbw     m8, m13, m6
    punpckhbw     m13, m6
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m13, pbm1_1
    paddw         m2, m8
    paddw         m7, m13              ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
    psrlw         m8, m2, 3
    psrlw         m13, m7, 3
    packuswb      m8, m13
    vpblendmb     m13{k2}, m6, m8      ; q1
%ifidn %2, v
    mova          [dstq+strideq*1], m13
%endif

    punpcklbw     m24, m3, m6
    punpckhbw     m26, m3, m6
    pmaddubsw     m24, m23
    pmaddubsw     m26, m23
    psubw         m2, m24
    psubw         m7, m26
    punpcklbw     m24, m14, m22
    punpckhbw     m26, m14, m22
    pmaddubsw     m24, m23
    pmaddubsw     m26, m23
    paddw         m2, m24
    paddw         m7, m26              ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
    psrlw         m2, 3
    psrlw         m7, 3
    packuswb      m2, m7
%if is_h || %1 == 16
    vpblendmb     m2{k2}, m14, m2      ; q2
%endif
%ifidn %2, v
 %if %1 == 8
    vmovdqu8      [dstq+strideq*2]{k2}, m2
 %else
    mova          [dstq+strideq*2], m2
 %endif
%endif

%ifidn %2, h
    SWAP          m24, m18
    SWAP          m26, m29
%if %1 == 8
    ; 16x8 transpose
    punpcklbw     m3, m25, m10
    punpckhbw     m25, m10
    punpcklbw     m10, m24, m26
    punpckhbw     m24, m26
    punpcklbw     m26, m11, m13
    punpckhbw     m11, m13
    punpcklbw     m13, m2, m22
    punpckhbw     m2, m22
    ;
    punpcklwd     m22, m3, m10
    punpckhwd     m3, m10
    punpcklwd     m10, m25, m24
    punpckhwd     m25, m24
    punpcklwd     m24, m26, m13
    punpckhwd     m26, m13
    punpcklwd     m13, m11, m2
    punpckhwd     m11, m2
    ;
    punpckldq     m2, m22, m24
    punpckhdq     m22, m24
    punpckldq     m24, m3, m26
    punpckhdq     m3, m26
    punpckldq     m26, m10, m13
    punpckhdq     m10, m13
    punpckldq     m13, m25, m11
    punpckhdq     m25, m11
    ; write 8x32
    vpbroadcastd  ym16, strided
    pmulld        ym16, [hmulD]
    lea           t1, [dstq+strideq*2]
    lea           t2, [dstq+strideq*4]
    lea           t3, [t1 +strideq*4]
    lea           t0, [dstq+strideq*8]
    kmovb         k1, k6
    kmovb         k2, k6
    kmovb         k3, k6
    kmovb         k4, k6
    vpscatterdq   [dstq+ym16-4]{k1}, m2
    vpscatterdq   [t1 +ym16-4]{k2}, m22
    vpscatterdq   [t2 +ym16-4]{k3}, m24
    vpscatterdq   [t3 +ym16-4]{k4}, m3
    lea           t1, [t0+strideq*2]
    lea           t2, [t0+strideq*4]
    lea           t3, [t1+strideq*4]
    kmovb         k1, k6
    kmovb         k2, k6
    kmovb         k3, k6
    kmovb         k4, k6
    vpscatterdq   [t0+ym16-4]{k1}, m26
    vpscatterdq   [t1+ym16-4]{k2}, m10
    vpscatterdq   [t2+ym16-4]{k3}, m13
    vpscatterdq   [t3+ym16-4]{k4}, m25
%else
    ; 16x16 transpose and store
    SWAP 5, 10, 2
    SWAP 6, 24
    SWAP 7, 26
    SWAP 8, 11
    SWAP 9, 13
    mova          m24, [rsp+0*64]
    SWAP          m26, m28
    mova          m2, [rsp+1*64]
    mova          m3, [rsp+2*64]
    mova          m4, [rsp+3*64]
    SWAP          m11, m16
    SWAP          m25, m17
    SWAP          m13, m27
    SWAP          m14, m30
    TRANSPOSE_16X16B 1, 0, [rsp+4*64]
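    ; After the second transpose the registers hold pixel rows again, so
    ; all 64 rows are written back as 128-bit quarters: movu covers lane 0
    ; and vextracti128 / vextracti32x4 store lanes 1-3.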
    movu          [dstq+strideq*0-8], xm24
    movu          [dstq+strideq*1-8], xm26
    movu          [dstq+strideq*2-8], xm2
    movu          [dstq+stride3q -8], xm3
    lea           t0, [dstq+strideq*4]
    movu          [t0+strideq*0-8], xm4
    movu          [t0+strideq*1-8], xm5
    movu          [t0+strideq*2-8], xm6
    movu          [t0+stride3q -8], xm7
    lea           t0, [t0+strideq*4]
    movu          [t0+strideq*0-8], xm8
    movu          [t0+strideq*1-8], xm9
    movu          [t0+strideq*2-8], xm10
    movu          [t0+stride3q -8], xm11
    lea           t0, [t0+strideq*4]
    movu          [t0+strideq*0-8], xm25
    movu          [t0+strideq*1-8], xm13
    movu          [t0+strideq*2-8], xm14
    movu          [t0+stride3q -8], xm22
    lea           t0, [t0+strideq*4]
    vextracti128  [t0+strideq*0-8], ym24, 1
    vextracti128  [t0+strideq*1-8], ym26, 1
    vextracti128  [t0+strideq*2-8], ym2, 1
    vextracti128  [t0+stride3q -8], ym3, 1
    lea           t0, [t0+strideq*4]
    vextracti128  [t0+strideq*0-8], ym4, 1
    vextracti128  [t0+strideq*1-8], ym5, 1
    vextracti128  [t0+strideq*2-8], ym6, 1
    vextracti128  [t0+stride3q -8], ym7, 1
    lea           t0, [t0+strideq*4]
    vextracti128  [t0+strideq*0-8], ym8, 1
    vextracti128  [t0+strideq*1-8], ym9, 1
    vextracti128  [t0+strideq*2-8], ym10, 1
    vextracti128  [t0+stride3q -8], ym11, 1
    lea           t0, [t0+strideq*4]
    vextracti128  [t0+strideq*0-8], ym25, 1
    vextracti128  [t0+strideq*1-8], ym13, 1
    vextracti128  [t0+strideq*2-8], ym14, 1
    vextracti128  [t0+stride3q -8], ym22, 1
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m24, 2
    vextracti32x4 [t0+strideq*1-8], m26, 2
    vextracti32x4 [t0+strideq*2-8], m2, 2
    vextracti32x4 [t0+stride3q -8], m3, 2
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m4, 2
    vextracti32x4 [t0+strideq*1-8], m5, 2
    vextracti32x4 [t0+strideq*2-8], m6, 2
    vextracti32x4 [t0+stride3q -8], m7, 2
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m8, 2
    vextracti32x4 [t0+strideq*1-8], m9, 2
    vextracti32x4 [t0+strideq*2-8], m10, 2
    vextracti32x4 [t0+stride3q -8], m11, 2
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m25, 2
    vextracti32x4 [t0+strideq*1-8], m13, 2
    vextracti32x4 [t0+strideq*2-8], m14, 2
    vextracti32x4 [t0+stride3q -8], m22, 2
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m24, 3
    vextracti32x4 [t0+strideq*1-8], m26, 3
    vextracti32x4 [t0+strideq*2-8], m2, 3
    vextracti32x4 [t0+stride3q -8], m3, 3
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m4, 3
    vextracti32x4 [t0+strideq*1-8], m5, 3
    vextracti32x4 [t0+strideq*2-8], m6, 3
    vextracti32x4 [t0+stride3q -8], m7, 3
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m8, 3
    vextracti32x4 [t0+strideq*1-8], m9, 3
    vextracti32x4 [t0+strideq*2-8], m10, 3
    vextracti32x4 [t0+stride3q -8], m11, 3
    lea           t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m25, 3
    vextracti32x4 [t0+strideq*1-8], m13, 3
    vextracti32x4 [t0+strideq*2-8], m14, 3
    vextracti32x4 [t0+stride3q -8], m22, 3
%endif
%endif

%elif %1 == 6
    ; flat6 filter
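    ; 6-tap flat filter for chroma (wd=6): e.g. the p1 output below is
    ; (p2*3 + p1*2 + p0*2 + q0 + 4) >> 3, built from word sums of
    ; pmaddubsw pairs and rounded via pmulhrsw with pw_4096 (kept in m16
    ; by the uv entry points), which computes exactly (sum + 4) >> 3.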
    vpbroadcastd  m15, [pb_3_1]
    vpbroadcastd  m12, [pb_2]
    punpcklbw     m8, m13, m5
    punpckhbw     m11, m13, m5
    pmaddubsw     m0, m8, m15
    pmaddubsw     m1, m11, m15
    punpcklbw     m7, m4, m3
    punpckhbw     m10, m4, m3
    pmaddubsw     m2, m7, m12
    pmaddubsw     m12, m10, m12
%ifidn %2, h
    vpbroadcastd  m15, [pb_m1_1]
 %define pbm1_1 m15
%endif
    paddw         m0, m2
    paddw         m1, m12
    pmulhrsw      m2, m0, m16
    pmulhrsw      m12, m1, m16
    packuswb      m2, m12
    vpblendmb     m2{k2}, m3, m2       ; p1
%ifidn %2, v
    mova          [t0+strideq*2], m2
%endif

    pmaddubsw     m8, pbm1_1
    pmaddubsw     m11, pbm1_1
    paddw         m0, m8
    paddw         m1, m11
    punpcklbw     m8, m13, m6
    punpckhbw     m11, m13, m6
    pmaddubsw     m8, pbm1_1
    pmaddubsw     m11, pbm1_1
    paddw         m0, m8
    paddw         m1, m11
    pmulhrsw      m12, m0, m16
    pmulhrsw      m13, m1, m16
    packuswb      m12, m13
    vpblendmb     m12{k2}, m4, m12     ; p0
%ifidn %2, v
    mova          [t0+stride3q], m12
%endif

    vpbroadcastd  m9, [pb_m1_2]
    vpbroadcastd  m4, [pb_m1_0]
    paddw         m0, m8
    paddw         m1, m11
    punpcklbw     m8, m3, m14
    punpckhbw     m11, m3, m14
    pmaddubsw     m14, m8, pbm1_1
    pmaddubsw     m13, m11, pbm1_1
    paddw         m0, m14
    paddw         m1, m13
    pmulhrsw      m14, m0, m16
    pmulhrsw      m13, m1, m16
    packuswb      m14, m13
    vpblendmb     m14{k2}, m5, m14     ; q0
%ifidn %2, v
    mova          [dstq+strideq*0], m14
%endif

    pmaddubsw     m8, m9
    pmaddubsw     m11, m9
    paddw         m0, m8
    paddw         m1, m11
    pmaddubsw     m7, m4
    pmaddubsw     m10, m4
    paddw         m0, m7
    paddw         m1, m10
    pmulhrsw      m0, m16
    pmulhrsw      m1, m16
    packuswb      m0, m1
    vpblendmb     m0{k2}, m6, m0       ; q1
%ifidn %2, v
    mova          [dstq+strideq*1], m0
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
%endif
%else ; %1 == 4
%ifidn %2, v
    mova          [t0+strideq*0], m3   ; p1
    mova          [t0+strideq*1], m4   ; p0
    mova          [t0+strideq*2], m5   ; q0
    mova          [t0+stride3q ], m6   ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
%endif
%endif
%endmacro

%define k7 k6

INIT_ZMM avx512icl
cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
                                    lut, w, stride3, mstride
    DECLARE_REG_TMP 9
    shl           l_strideq, 2
    sub           lq, l_strideq
    mov           mstrideq, strideq
    neg           mstrideq
    lea           stride3q, [strideq*3]
    mova          m21, [pb_4x0_4x4_4x8_4x12]
    mova          m20, [pb_mask]
    vpbroadcastd  m19, [pb_128]
    vpbroadcastd  m28, [pb_m1_1]
    vpbroadcastd  m27, [pw_2048]
    %define pbshuf m21
    %define pbmask m20
    %define pb128  m19
    %define pbm1_1 m28
    %define pw2048 m27

.loop:
    cmp word [maskq+8], 0              ; vmask[2]
    je .no_flat16

    FILTER 16, v
    jmp .end

.no_flat16:
    cmp word [maskq+4], 0              ; vmask[1]
    je .no_flat

    FILTER 8, v
    jmp .end

.no_flat:
    cmp word [maskq+0], 0              ; vmask[0]
    je .end

    call .v4

.end:
    add           lq, 64
    add           dstq, 64
    add           maskq, 2
    sub           wd, 16
    jg .loop
    RET
ALIGN function_align
RESET_MM_PERMUTATION
.v4:
    FILTER 4, v
    ret

cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
                                          lut, h, stride3, stride8
    DECLARE_REG_TMP 9, 10, 11, 12
    shl           l_strideq, 2
    sub           lq, 4
    lea           stride3q, [strideq*3]
    lea           stride8q, [strideq*8]
    kxnorw        k6, k6, k6
    vpbroadcastd  m19, strided
    vpbroadcastd  m20, l_strided
    pmulld        m21, m19, [hmulA]
    pmulld        m20, [hmulB]
    pmulld        m19, [hmulC]
    %define pbshuf [pb_4x0_4x4_4x8_4x12]
    %define pbmask [pb_mask]
    %define pb128  [pb_128]{bcstd}
    shl           l_strideq, 1

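    ; m19/m20/m21 now hold per-lane byte offsets (row index * stride) for
    ; the gathers and scatters used by the horizontal filters, and the
    ; pbshuf/pbmask/pb128 constants are rebound to memory operands because
    ; the wd=16 path needs all 32 vector registers. k6 is all-ones here,
    ; with k7 aliased to it (via the %define above) for the luma functions.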
.loop:
    cmp word [maskq+8], 0              ; vmask[2]
    je .no_flat16

    FILTER 16, h
    jmp .end

.no_flat16:
    cmp word [maskq+4], 0              ; vmask[1]
    je .no_flat

    FILTER 8, h
    jmp .end

.no_flat:
    cmp word [maskq+0], 0              ; vmask[0]
    je .end

    call .h4

.end:
    lea           lq, [lq+l_strideq*8]
    lea           dstq, [dstq+stride8q*8]
    add           maskq, 2
    sub           hd, 16
    jg .loop
    RET
ALIGN function_align
RESET_MM_PERMUTATION
.h4:
    FILTER 4, h
    ret

cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
                                     lut, w, stride3, mstride
    DECLARE_REG_TMP 9
    shl           l_strideq, 2
    sub           lq, l_strideq
    mov           mstrideq, strideq
    neg           mstrideq
    lea           stride3q, [strideq*3]
    mova          m21, [pb_4x0_4x4_4x8_4x12]
    mova          m20, [pb_mask]
    vpbroadcastd  m19, [pb_128]
    vpbroadcastd  m17, [pb_m1_1]
    vpbroadcastd  m16, [pw_4096]
    %define pbshuf m21
    %define pbmask m20
    %define pb128  m19
    %define pbm1_1 m17

.loop:
    cmp word [maskq+4], 0              ; vmask[1]
    je .no_flat

    FILTER 6, v
    jmp .end

.no_flat:
    cmp word [maskq+0], 0              ; vmask[0]
    je .end

    call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4

.end:
    add           lq, 64
    add           dstq, 64
    add           maskq, 2
    sub           wd, 16
    jg .loop
    RET

%undef k7
cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
                                     lut, h, stride3, stride8
    DECLARE_REG_TMP 9, 10, 11
    mov           r7d, 0xffff
    movzx         r8d, r7b
    cmp           hd, 9
    cmovb         r7d, r8d
    kmovw         k6, r7d              ; h > 8 ? 0xffff : 0x00ff
    shl           l_strideq, 2
    sub           lq, 4
    kshiftrw      k7, k6, 4            ; h > 8 ? 0xff : 0x0f (low byte)
    lea           stride3q, [strideq*3]
    lea           stride8q, [strideq*8]
    vpbroadcastd  m19, strided
    vpbroadcastd  m20, l_strided
    pmulld        m21, m19, [hmulA]
    pmulld        m20, [hmulB]
    pmulld        m19, [hmulC]
    mova          m18, [pb_mask]
    vpbroadcastd  m17, [pb_128]
    vpbroadcastd  m16, [pw_4096]
    %define pbshuf [pb_4x0_4x4_4x8_4x12]
    %define pbmask m18
    %define pb128  m17
    add           l_strideq, l_strideq

.loop:
    cmp word [maskq+4], 0              ; vmask[1]
    je .no_flat

    FILTER 6, h
    jmp .end

.no_flat:
    cmp word [maskq+0], 0              ; vmask[0]
    je .end

    call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4

.end:
    lea           lq, [lq+l_strideq*8]
    lea           dstq, [dstq+stride8q*8]
    add           maskq, 2
    sub           hd, 16
    jg .loop
    RET

%endif ; ARCH_X86_64