; Copyright © 2018-2021, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 26%include "config.asm" 27%include "ext/x86/x86inc.asm" 28 29%if ARCH_X86_64 30 31SECTION_RODATA 32 32 33pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12 34pb_7_1: times 16 db 7, 1 35pb_3_1: times 16 db 3, 1 36pb_2_1: times 16 db 2, 1 37pb_m1_0: times 16 db -1, 0 38pb_m1_1: times 16 db -1, 1 39pb_m1_2: times 16 db -1, 2 40pb_1: times 32 db 1 41pb_2: times 32 db 2 42pb_3: times 32 db 3 43pb_4: times 32 db 4 44pb_16: times 32 db 16 45pb_63: times 32 db 63 46pb_64: times 32 db 64 47pb_128: times 32 db 0x80 48pb_129: times 32 db 0x81 49pb_240: times 32 db 0xf0 50pb_248: times 32 db 0xf8 51pb_254: times 32 db 0xfe 52 53pw_2048: times 16 dw 2048 54pw_4096: times 16 dw 4096 55 56pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128 57 58SECTION .text 59 60%macro ABSSUB 4 ; dst, a, b, tmp 61 psubusb %1, %2, %3 62 psubusb %4, %3, %2 63 por %1, %4 64%endmacro 65 66%macro TRANSPOSE_16x4_AND_WRITE_4x32 5 67 ; transpose 16x4 68 punpcklbw m%5, m%1, m%2 69 punpckhbw m%1, m%2 70 punpcklbw m%2, m%3, m%4 71 punpckhbw m%3, m%4 72 punpcklwd m%4, m%5, m%2 73 punpckhwd m%5, m%2 74 punpcklwd m%2, m%1, m%3 75 punpckhwd m%1, m%3 76 77 ; write out 78 movd [dstq+strideq*0-2], xm%4 79 pextrd [dstq+strideq*1-2], xm%4, 1 80 pextrd [dstq+strideq*2-2], xm%4, 2 81 pextrd [dstq+stride3q-2], xm%4, 3 82 lea dstq, [dstq+strideq*4] 83 movd [dstq+strideq*0-2], xm%5 84 pextrd [dstq+strideq*1-2], xm%5, 1 85 pextrd [dstq+strideq*2-2], xm%5, 2 86 pextrd [dstq+stride3q-2], xm%5, 3 87 lea dstq, [dstq+strideq*4] 88 movd [dstq+strideq*0-2], xm%2 89 pextrd [dstq+strideq*1-2], xm%2, 1 90 pextrd [dstq+strideq*2-2], xm%2, 2 91 pextrd [dstq+stride3q-2], xm%2, 3 92 lea dstq, [dstq+strideq*4] 93 movd [dstq+strideq*0-2], xm%1 94 pextrd [dstq+strideq*1-2], xm%1, 1 95 pextrd [dstq+strideq*2-2], xm%1, 2 96 pextrd [dstq+stride3q-2], xm%1, 3 97 lea dstq, [dstq+strideq*4] 98 99 vextracti128 xm%4, m%4, 1 100 vextracti128 xm%5, m%5, 1 101 vextracti128 xm%2, m%2, 1 102 vextracti128 xm%1, m%1, 1 103 104 
movd [dstq+strideq*0-2], xm%4 105 pextrd [dstq+strideq*1-2], xm%4, 1 106 pextrd [dstq+strideq*2-2], xm%4, 2 107 pextrd [dstq+stride3q-2], xm%4, 3 108 lea dstq, [dstq+strideq*4] 109 movd [dstq+strideq*0-2], xm%5 110 pextrd [dstq+strideq*1-2], xm%5, 1 111 pextrd [dstq+strideq*2-2], xm%5, 2 112 pextrd [dstq+stride3q-2], xm%5, 3 113 lea dstq, [dstq+strideq*4] 114 movd [dstq+strideq*0-2], xm%2 115 pextrd [dstq+strideq*1-2], xm%2, 1 116 pextrd [dstq+strideq*2-2], xm%2, 2 117 pextrd [dstq+stride3q-2], xm%2, 3 118 lea dstq, [dstq+strideq*4] 119 movd [dstq+strideq*0-2], xm%1 120 pextrd [dstq+strideq*1-2], xm%1, 1 121 pextrd [dstq+strideq*2-2], xm%1, 2 122 pextrd [dstq+stride3q-2], xm%1, 3 123 lea dstq, [dstq+strideq*4] 124%endmacro 125 126%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem 127%if %1 == 0 128 mova %3, m15 129%endif 130 131 ; input in m0-15 132 punpcklbw m15, m0, m1 133 punpckhbw m0, m1 134 punpcklbw m1, m2, m3 135 punpckhbw m2, m3 136 punpcklbw m3, m4, m5 137 punpckhbw m4, m5 138 punpcklbw m5, m6, m7 139 punpckhbw m6, m7 140 punpcklbw m7, m8, m9 141 punpckhbw m8, m9 142 punpcklbw m9, m10, m11 143 punpckhbw m10, m11 144 punpcklbw m11, m12, m13 145 punpckhbw m12, m13 146 mova m13, %3 147 mova %3, m12 148 punpcklbw m12, m14, m13 149 punpckhbw m13, m14, m13 150 151 ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13 152 punpcklwd m14, m15, m1 153 punpckhwd m15, m1 154 punpcklwd m1, m0, m2 155 punpckhwd m0, m2 156 punpcklwd m2, m3, m5 157 punpckhwd m3, m5 158 punpcklwd m5, m4, m6 159 punpckhwd m4, m6 160 punpcklwd m6, m7, m9 161 punpckhwd m7, m9 162 punpcklwd m9, m8, m10 163 punpckhwd m8, m10 164 punpcklwd m10, m11, m12 165 punpckhwd m11, m12 166 mova m12, %3 167 mova %3, m11 168 punpcklwd m11, m12, m13 169 punpckhwd m12, m13 170 171 ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12 172 punpckldq m13, m14, m2 173 punpckhdq m14, m2 174 punpckldq m2, m15, m3 175 punpckhdq m15, m3 176 punpckldq m3, m1, m5 177 punpckhdq m1, m5 178 
punpckldq m5, m0, m4 179 punpckhdq m0, m4 180 punpckldq m4, m6, m10 181 punpckhdq m6, m10 182 punpckldq m10, m9, m11 183 punpckhdq m9, m11 184 punpckldq m11, m8, m12 185 punpckhdq m8, m12 186 mova m12, %3 187 mova %3, m8 188 punpckldq m8, m7, m12 189 punpckhdq m7, m12 190 191 ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3 192 punpcklqdq m12, m13, m4 193 punpckhqdq m13, m4 194 punpcklqdq m4, m14, m6 195 punpckhqdq m14, m6 196 punpcklqdq m6, m2, m8 197 punpckhqdq m2, m8 198 punpcklqdq m8, m15, m7 199 punpckhqdq m15, m7 200 punpcklqdq m7, m3, m10 201 punpckhqdq m3, m10 202 punpcklqdq m10, m1, m9 203 punpckhqdq m1, m9 204 punpcklqdq m9, m5, m11 205 punpckhqdq m5, m11 206 mova m11, %3 207 mova %3, m12 208 punpcklqdq m12, m0, m11 209 punpckhqdq m0, m11 210%if %2 == 0 211 mova m11, %3 212%endif 213 214 ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0 215 SWAP 0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15 216 SWAP 3, 14, 12, 9 217%endmacro 218 219%macro FILTER 2 ; width [4/6/8/16], dir [h/v] 220 ; load data 221%ifidn %2, v 222%if %1 == 4 223 lea tmpq, [dstq+mstrideq*2] 224 mova m3, [tmpq+strideq*0] ; p1 225 mova m4, [tmpq+strideq*1] ; p0 226 mova m5, [tmpq+strideq*2] ; q0 227 mova m6, [tmpq+stride3q] ; q1 228%else 229 ; load 6-8 pixels, remainder (for wd=16) will be read inline 230 lea tmpq, [dstq+mstrideq*4] 231%if %1 != 6 232 mova m12, [tmpq+strideq*0] 233%endif 234 mova m13, [tmpq+strideq*1] 235 mova m3, [tmpq+strideq*2] 236 mova m4, [tmpq+stride3q] 237 mova m5, [dstq+strideq*0] 238 mova m6, [dstq+strideq*1] 239 mova m14, [dstq+strideq*2] 240%if %1 != 6 241 mova m15, [dstq+stride3q] 242%endif 243%endif 244%else 245 ; load lines 246%if %1 == 4 247 movd xm3, [dstq+strideq*0-2] 248 movd xm4, [dstq+strideq*1-2] 249 movd xm5, [dstq+strideq*2-2] 250 movd xm6, [dstq+stride3q -2] 251 lea tmpq, [dstq+strideq*4] 252 pinsrd xm3, [tmpq+strideq*0-2], 2 253 pinsrd xm4, [tmpq+strideq*1-2], 2 254 pinsrd xm5, [tmpq+strideq*2-2], 2 255 pinsrd xm6, [tmpq+stride3q -2], 2 256 lea tmpq, 
[tmpq+strideq*4] 257 pinsrd xm3, [tmpq+strideq*0-2], 1 258 pinsrd xm4, [tmpq+strideq*1-2], 1 259 pinsrd xm5, [tmpq+strideq*2-2], 1 260 pinsrd xm6, [tmpq+stride3q -2], 1 261 lea tmpq, [tmpq+strideq*4] 262 pinsrd xm3, [tmpq+strideq*0-2], 3 263 pinsrd xm4, [tmpq+strideq*1-2], 3 264 pinsrd xm5, [tmpq+strideq*2-2], 3 265 pinsrd xm6, [tmpq+stride3q -2], 3 266 lea tmpq, [tmpq+strideq*4] 267 movd xm12, [tmpq+strideq*0-2] 268 movd xm13, [tmpq+strideq*1-2] 269 movd xm14, [tmpq+strideq*2-2] 270 movd xm15, [tmpq+stride3q -2] 271 lea tmpq, [tmpq+strideq*4] 272 pinsrd xm12, [tmpq+strideq*0-2], 2 273 pinsrd xm13, [tmpq+strideq*1-2], 2 274 pinsrd xm14, [tmpq+strideq*2-2], 2 275 pinsrd xm15, [tmpq+stride3q -2], 2 276 lea tmpq, [tmpq+strideq*4] 277 pinsrd xm12, [tmpq+strideq*0-2], 1 278 pinsrd xm13, [tmpq+strideq*1-2], 1 279 pinsrd xm14, [tmpq+strideq*2-2], 1 280 pinsrd xm15, [tmpq+stride3q -2], 1 281 lea tmpq, [tmpq+strideq*4] 282 pinsrd xm12, [tmpq+strideq*0-2], 3 283 pinsrd xm13, [tmpq+strideq*1-2], 3 284 pinsrd xm14, [tmpq+strideq*2-2], 3 285 pinsrd xm15, [tmpq+stride3q -2], 3 286 vinserti128 m3, xm12, 1 287 vinserti128 m4, xm13, 1 288 vinserti128 m5, xm14, 1 289 vinserti128 m6, xm15, 1 290 291 ; transpose 4x16 292 ; xm3: A-D0,A-D8,A-D4,A-D12 293 ; xm4: A-D1,A-D9,A-D5,A-D13 294 ; xm5: A-D2,A-D10,A-D6,A-D14 295 ; xm6: A-D3,A-D11,A-D7,A-D15 296 punpcklbw m7, m3, m4 297 punpckhbw m3, m4 298 punpcklbw m4, m5, m6 299 punpckhbw m5, m6 300 ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9 301 ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13 302 ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11 303 ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15 304 punpcklwd m6, m7, m4 305 punpckhwd m7, m4 306 punpcklwd m4, m3, m5 307 punpckhwd m3, m5 308 ; xm6: A0-3,B0-3,C0-3,D0-3 309 ; xm7: A8-11,B8-11,C8-11,D8-11 310 ; xm4: A4-7,B4-7,C4-7,D4-7 311 ; xm3: A12-15,B12-15,C12-15,D12-15 312 punpckldq m5, m6, m4 313 punpckhdq m6, m4 314 punpckldq m4, m7, m3 315 punpckhdq m7, m3 316 ; xm5: 
A0-7,B0-7 317 ; xm6: C0-7,D0-7 318 ; xm4: A8-15,B8-15 319 ; xm7: C8-15,D8-15 320 punpcklqdq m3, m5, m4 321 punpckhqdq m4, m5, m4 322 punpcklqdq m5, m6, m7 323 punpckhqdq m6, m7 324 ; xm3: A0-15 325 ; xm5: B0-15 326 ; xm4: C0-15 327 ; xm6: D0-15 328%elif %1 == 6 || %1 == 8 329 movq xm3, [dstq+strideq*0-%1/2] 330 movq xm4, [dstq+strideq*1-%1/2] 331 movq xm5, [dstq+strideq*2-%1/2] 332 movq xm6, [dstq+stride3q -%1/2] 333 lea tmpq, [dstq+strideq*8] 334 movhps xm3, [tmpq+strideq*0-%1/2] 335 movhps xm4, [tmpq+strideq*1-%1/2] 336 movhps xm5, [tmpq+strideq*2-%1/2] 337 movhps xm6, [tmpq+stride3q -%1/2] 338 lea tmpq, [tmpq+strideq*8] 339 movq xm7, [tmpq+strideq*0-%1/2] 340 movq xm8, [tmpq+strideq*1-%1/2] 341 movq xm9, [tmpq+strideq*2-%1/2] 342 movq xm11, [tmpq+stride3q -%1/2] 343 lea tmpq, [tmpq+strideq*8] 344 movhps xm7, [tmpq+strideq*0-%1/2] 345 movhps xm8, [tmpq+strideq*1-%1/2] 346 movhps xm9, [tmpq+strideq*2-%1/2] 347 movhps xm11, [tmpq+stride3q -%1/2] 348 vinserti128 m3, xm7, 1 349 vinserti128 m4, xm8, 1 350 vinserti128 m5, xm9, 1 351 vinserti128 m6, xm11, 1 352 lea tmpq, [dstq+strideq*4] 353 movq xm12, [tmpq+strideq*0-%1/2] 354 movq xm13, [tmpq+strideq*1-%1/2] 355 movq xm14, [tmpq+strideq*2-%1/2] 356 movq xm15, [tmpq+stride3q -%1/2] 357 lea tmpq, [tmpq+strideq*8] 358 movhps xm12, [tmpq+strideq*0-%1/2] 359 movhps xm13, [tmpq+strideq*1-%1/2] 360 movhps xm14, [tmpq+strideq*2-%1/2] 361 movhps xm15, [tmpq+stride3q -%1/2] 362 lea tmpq, [tmpq+strideq*8] 363 movq xm7, [tmpq+strideq*0-%1/2] 364 movq xm8, [tmpq+strideq*1-%1/2] 365 movq xm9, [tmpq+strideq*2-%1/2] 366 movq xm11, [tmpq+stride3q -%1/2] 367 lea tmpq, [tmpq+strideq*8] 368 movhps xm7, [tmpq+strideq*0-%1/2] 369 movhps xm8, [tmpq+strideq*1-%1/2] 370 movhps xm9, [tmpq+strideq*2-%1/2] 371 movhps xm11, [tmpq+stride3q -%1/2] 372 vinserti128 m12, xm7, 1 373 vinserti128 m13, xm8, 1 374 vinserti128 m14, xm9, 1 375 vinserti128 m15, xm11, 1 376 377 ; transpose 8x16 378 ; xm3: A-H0,A-H8 379 ; xm4: A-H1,A-H9 380 ; xm5: A-H2,A-H10 
381 ; xm6: A-H3,A-H11 382 ; xm12: A-H4,A-H12 383 ; xm13: A-H5,A-H13 384 ; xm14: A-H6,A-H14 385 ; xm15: A-H7,A-H15 386 punpcklbw m7, m3, m4 387 punpckhbw m3, m4 388 punpcklbw m4, m5, m6 389 punpckhbw m5, m6 390 punpcklbw m6, m12, m13 391 punpckhbw m12, m13 392 punpcklbw m13, m14, m15 393 punpckhbw m14, m15 394 ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1 395 ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9 396 ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3 397 ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11 398 ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5 399 ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13 400 ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7 401 ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15 402 punpcklwd m15, m7, m4 403 punpckhwd m7, m4 404 punpcklwd m4, m3, m5 405 punpckhwd m3, m5 406 punpcklwd m5, m6, m13 407 punpckhwd m6, m13 408 punpcklwd m13, m12, m14 409 punpckhwd m12, m14 410 ; xm15: A0-3,B0-3,C0-3,D0-3 411 ; xm7: E0-3,F0-3,G0-3,H0-3 412 ; xm4: A8-11,B8-11,C8-11,D8-11 413 ; xm3: E8-11,F8-11,G8-11,H8-11 414 ; xm5: A4-7,B4-7,C4-7,D4-7 415 ; xm6: E4-7,F4-7,G4-7,H4-7 416 ; xm13: A12-15,B12-15,C12-15,D12-15 417 ; xm12: E12-15,F12-15,G12-15,H12-15 418 punpckldq m14, m15, m5 419 punpckhdq m15, m5 420 punpckldq m5, m7, m6 421%if %1 != 6 422 punpckhdq m7, m6 423%endif 424 punpckldq m6, m4, m13 425 punpckhdq m4, m13 426 punpckldq m13, m3, m12 427%if %1 != 6 428 punpckhdq m12, m3, m12 429%endif 430 ; xm14: A0-7,B0-7 431 ; xm15: C0-7,D0-7 432 ; xm5: E0-7,F0-7 433 ; xm7: G0-7,H0-7 434 ; xm6: A8-15,B8-15 435 ; xm4: C8-15,D8-15 436 ; xm13: E8-15,F8-15 437 ; xm12: G8-15,H8-15 438 punpcklqdq m3, m14, m6 439 punpckhqdq m14, m6 440 punpckhqdq m6, m15, m4 441 punpcklqdq m15, m4 442 punpcklqdq m4, m5, m13 443 punpckhqdq m13, m5, m13 444%if %1 == 8 445 punpcklqdq m5, m7, m12 446 punpckhqdq m12, m7, m12 447 ; xm3: A0-15 448 ; xm14: B0-15 449 ; xm15: C0-15 450 ; xm6: D0-15 451 ; xm4: E0-15 452 ; xm13: F0-15 453 ; 
xm5: G0-15 454 ; xm12: H0-15 455 SWAP 12, 3, 15 456 SWAP 13, 14, 5, 4, 6 457 ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15 458%else 459 SWAP 13, 3, 14 460 SWAP 6, 4, 15, 5 461 ; 3,14,15,6,4,13 -> 13,3,4,5,6,14 462%endif 463%else 464 ; load and 16x16 transpose. We only use 14 pixels but we'll need the 465 ; remainder at the end for the second transpose 466 movu xm0, [dstq+strideq*0-8] 467 movu xm1, [dstq+strideq*1-8] 468 movu xm2, [dstq+strideq*2-8] 469 movu xm3, [dstq+stride3q -8] 470 lea tmpq, [dstq+strideq*4] 471 movu xm4, [tmpq+strideq*0-8] 472 movu xm5, [tmpq+strideq*1-8] 473 movu xm6, [tmpq+strideq*2-8] 474 movu xm7, [tmpq+stride3q -8] 475 lea tmpq, [tmpq+strideq*4] 476 movu xm8, [tmpq+strideq*0-8] 477 movu xm9, [tmpq+strideq*1-8] 478 movu xm10, [tmpq+strideq*2-8] 479 movu xm11, [tmpq+stride3q -8] 480 lea tmpq, [tmpq+strideq*4] 481 movu xm12, [tmpq+strideq*0-8] 482 movu xm13, [tmpq+strideq*1-8] 483 movu xm14, [tmpq+strideq*2-8] 484 movu xm15, [tmpq+stride3q -8] 485 lea tmpq, [tmpq+strideq*4] 486 vinserti128 m0, [tmpq+strideq*0-8], 1 487 vinserti128 m1, [tmpq+strideq*1-8], 1 488 vinserti128 m2, [tmpq+strideq*2-8], 1 489 vinserti128 m3, [tmpq+stride3q -8], 1 490 lea tmpq, [tmpq+strideq*4] 491 vinserti128 m4, [tmpq+strideq*0-8], 1 492 vinserti128 m5, [tmpq+strideq*1-8], 1 493 vinserti128 m6, [tmpq+strideq*2-8], 1 494 vinserti128 m7, [tmpq+stride3q -8], 1 495 lea tmpq, [tmpq+strideq*4] 496 vinserti128 m8, [tmpq+strideq*0-8], 1 497 vinserti128 m9, [tmpq+strideq*1-8], 1 498 vinserti128 m10, [tmpq+strideq*2-8], 1 499 vinserti128 m11, [tmpq+stride3q -8], 1 500 lea tmpq, [tmpq+strideq*4] 501 vinserti128 m12, [tmpq+strideq*0-8], 1 502 vinserti128 m13, [tmpq+strideq*1-8], 1 503 vinserti128 m14, [tmpq+strideq*2-8], 1 504 vinserti128 m15, [tmpq+stride3q -8], 1 505 506 TRANSPOSE_16X16B 0, 1, [rsp+11*32] 507 mova [rsp+12*32], m1 508 mova [rsp+13*32], m2 509 mova [rsp+14*32], m3 510 mova [rsp+15*32], m12 511 mova [rsp+16*32], m13 512 mova [rsp+17*32], m14 513 mova [rsp+18*32], 
m15 514 ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15 515 SWAP 12, 4, 7 516 SWAP 13, 5, 8 517 SWAP 3, 6, 9 518 SWAP 10, 14 519 SWAP 11, 15 520%endif 521%endif 522 523 ; load L/E/I/H 524%ifidn %2, v 525 movu m1, [lq] 526 movu m0, [lq+l_strideq] 527%else 528 movq xm1, [lq] 529 movq xm2, [lq+l_strideq*2] 530 movhps xm1, [lq+l_strideq] 531 movhps xm2, [lq+l_stride3q] 532 lea lq, [lq+l_strideq*4] 533 movq xm10, [lq] 534 movq xm0, [lq+l_strideq*2] 535 movhps xm10, [lq+l_strideq] 536 movhps xm0, [lq+l_stride3q] 537 lea lq, [lq+l_strideq*4] 538 vinserti128 m1, xm10, 1 539 vinserti128 m2, xm0, 1 540 shufps m0, m1, m2, q3131 541 shufps m1, m2, q2020 542%endif 543 pxor m2, m2 544 pcmpeqb m10, m2, m0 545 pand m1, m10 546 por m0, m1 ; l[x][] ? l[x][] : l[x-stride][] 547 pshufb m0, [pb_4x1_4x5_4x9_4x13] ; l[x][1] 548 pcmpeqb m10, m2, m0 ; !L 549 psrlq m2, m0, [lutq+128] 550 pand m2, [pb_63] 551 vpbroadcastb m1, [lutq+136] 552 pminub m2, m1 553 pmaxub m2, [pb_1] ; I 554 pand m1, m0, [pb_240] 555 psrlq m1, 4 ; H 556 paddb m0, [pb_2] 557 paddb m0, m0 558 paddb m0, m2 ; E 559 pxor m1, [pb_128] 560 pxor m2, [pb_128] 561 pxor m0, [pb_128] 562 563 ABSSUB m8, m3, m4, m9 ; abs(p1-p0) 564 pmaxub m8, m10 565 ABSSUB m9, m5, m6, m10 ; abs(q1-q0) 566 pmaxub m8, m9 567%if %1 == 4 568 pxor m8, [pb_128] 569 pcmpgtb m7, m8, m1 ; hev 570%else 571 pxor m7, m8, [pb_128] 572 pcmpgtb m7, m1 ; hev 573 574%if %1 == 6 575 ABSSUB m9, m13, m4, m10 ; abs(p2-p0) 576 pmaxub m9, m8 577%else 578 ABSSUB m9, m12, m4, m10 ; abs(p3-p0) 579 pmaxub m9, m8 580 ABSSUB m10, m13, m4, m11 ; abs(p2-p0) 581 pmaxub m9, m10 582%endif 583 ABSSUB m10, m5, m14, m11 ; abs(q2-q0) 584 pmaxub m9, m10 585%if %1 != 6 586 ABSSUB m10, m5, m15, m11 ; abs(q3-q0) 587 pmaxub m9, m10 588%endif 589 pxor m9, [pb_128] 590 pcmpgtb m9, [pb_129] ; !flat8in 591 592%if %1 == 6 593 ABSSUB m10, m13, m3, m1 ; abs(p2-p1) 594%else 595 ABSSUB m10, m12, m13, m11 ; abs(p3-p2) 596 ABSSUB m11, m13, m3, m1 ; abs(p2-p1) 597 pmaxub m10, m11 598 ABSSUB m11, m14, 
m15, m1 ; abs(q3-q2) 599 pmaxub m10, m11 600%endif 601 ABSSUB m11, m14, m6, m1 ; abs(q2-q1) 602 pmaxub m10, m11 603%if %1 == 16 604 vpbroadcastd m11, [maskq+8] 605 vpbroadcastd m1, [maskq+4] 606 por m11, m1 607 pand m11, [pb_mask] 608 pcmpeqd m11, [pb_mask] 609 pand m10, m11 610%else 611 vpbroadcastd m11, [maskq+4] 612 pand m11, [pb_mask] 613 pcmpeqd m11, [pb_mask] 614 pand m10, m11 ; only apply fm-wide to wd>4 blocks 615%endif 616 pmaxub m8, m10 617 618 pxor m8, [pb_128] 619%endif 620 pcmpgtb m8, m2 621 622 ABSSUB m10, m3, m6, m11 ; abs(p1-q1) 623 ABSSUB m11, m4, m5, m2 ; abs(p0-q0) 624 paddusb m11, m11 625 pand m10, [pb_254] 626 psrlq m10, 1 627 paddusb m10, m11 ; abs(p0-q0)*2+(abs(p1-q1)>>1) 628 pxor m10, [pb_128] 629 pcmpgtb m10, m0 ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E 630 por m8, m10 631 632%if %1 == 16 633%ifidn %2, v 634 lea tmpq, [dstq+mstrideq*8] 635 mova m0, [tmpq+strideq*1] 636%else 637 mova m0, [rsp+12*32] 638%endif 639 ABSSUB m1, m0, m4, m2 640%ifidn %2, v 641 mova m0, [tmpq+strideq*2] 642%else 643 mova m0, [rsp+13*32] 644%endif 645 ABSSUB m2, m0, m4, m10 646 pmaxub m1, m2 647%ifidn %2, v 648 mova m0, [tmpq+stride3q] 649%else 650 mova m0, [rsp+14*32] 651%endif 652 ABSSUB m2, m0, m4, m10 653 pmaxub m1, m2 654%ifidn %2, v 655 lea tmpq, [dstq+strideq*4] 656 mova m0, [tmpq+strideq*0] 657%else 658 mova m0, [rsp+15*32] 659%endif 660 ABSSUB m2, m0, m5, m10 661 pmaxub m1, m2 662%ifidn %2, v 663 mova m0, [tmpq+strideq*1] 664%else 665 mova m0, [rsp+16*32] 666%endif 667 ABSSUB m2, m0, m5, m10 668 pmaxub m1, m2 669%ifidn %2, v 670 mova m0, [tmpq+strideq*2] 671%else 672 mova m0, [rsp+17*32] 673%endif 674 ABSSUB m2, m0, m5, m10 675 pmaxub m1, m2 676 pxor m1, [pb_128] 677 pcmpgtb m1, [pb_129] ; !flat8out 678 por m1, m9 ; !flat8in | !flat8out 679 vpbroadcastd m2, [maskq+8] 680 pand m10, m2, [pb_mask] 681 pcmpeqd m10, [pb_mask] 682 pandn m1, m10 ; flat16 683 pandn m1, m8, m1 ; flat16 & fm 684 685 vpbroadcastd m10, [maskq+4] 686 por m10, m2 687 pand m2, m10, [pb_mask] 688 
pcmpeqd m2, [pb_mask] 689 pandn m9, m2 ; flat8in 690 pandn m9, m8, m9 691 vpbroadcastd m2, [maskq+0] 692 por m2, m10 693 pand m2, [pb_mask] 694 pcmpeqd m2, [pb_mask] 695 pandn m8, m2 696 pandn m8, m9, m8 ; fm & !flat8 & !flat16 697 pandn m9, m1, m9 ; flat8 & !flat16 698%elif %1 != 4 699 vpbroadcastd m0, [maskq+4] 700 pand m2, m0, [pb_mask] 701 pcmpeqd m2, [pb_mask] 702 pandn m9, m2 703 pandn m9, m8, m9 ; flat8 & fm 704 vpbroadcastd m2, [maskq+0] 705 por m0, m2 706 pand m0, [pb_mask] 707 pcmpeqd m0, [pb_mask] 708 pandn m8, m0 709 pandn m8, m9, m8 ; fm & !flat8 710%else 711 vpbroadcastd m0, [maskq+0] 712 pand m0, [pb_mask] 713 pcmpeqd m0, [pb_mask] 714 pandn m8, m0 ; fm 715%endif 716 717 ; short filter 718 719 pxor m3, [pb_128] 720 pxor m6, [pb_128] 721 psubsb m10, m3, m6 ; iclip_diff(p1-q1) 722 pand m10, m7 ; f=iclip_diff(p1-q1)&hev 723 pxor m4, [pb_128] 724 pxor m5, [pb_128] 725 psubsb m11, m5, m4 726 paddsb m10, m11 727 paddsb m10, m11 728 paddsb m10, m11 ; f=iclip_diff(3*(q0-p0)+f) 729 pand m8, m10 ; f&=fm 730 paddsb m10, m8, [pb_3] 731 paddsb m8, [pb_4] 732 pand m10, [pb_248] 733 pand m8, [pb_248] 734 psrlq m10, 3 735 psrlq m8, 3 736 pxor m10, [pb_16] 737 pxor m8, [pb_16] 738 psubb m10, [pb_16] ; f2 739 psubb m8, [pb_16] ; f1 740 paddsb m4, m10 741 psubsb m5, m8 742 pxor m4, [pb_128] 743 pxor m5, [pb_128] 744 745 pxor m8, [pb_128] 746 pxor m10, m10 747 pavgb m8, m10 ; f=(f1+1)>>1 748 psubb m8, [pb_64] 749 pandn m8, m7, m8 ; f&=!hev 750 paddsb m3, m8 751 psubsb m6, m8 752 pxor m3, [pb_128] 753 pxor m6, [pb_128] 754 755%if %1 == 16 756 ; flat16 filter 757%ifidn %2, v 758 lea tmpq, [dstq+mstrideq*8] 759 mova m0, [tmpq+strideq*1] ; p6 760 mova m2, [tmpq+strideq*2] ; p5 761 mova m7, [tmpq+stride3q] ; p4 762%else 763 mova m0, [rsp+12*32] 764 mova m2, [rsp+13*32] 765 mova m7, [rsp+14*32] 766%endif 767 768 mova [rsp+0*32], m9 769 mova [rsp+1*32], m14 770 mova [rsp+2*32], m15 771 772 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A 773 ; write -6 774 
punpcklbw m14, m0, m12 775 punpckhbw m15, m0, m12 776 pmaddubsw m10, m14, [pb_7_1] 777 pmaddubsw m11, m15, [pb_7_1] ; p6*7+p3 778 punpcklbw m8, m2, m7 779 punpckhbw m9, m2, m7 780 pmaddubsw m8, [pb_2] 781 pmaddubsw m9, [pb_2] 782 paddw m10, m8 783 paddw m11, m9 ; p6*7+p5*2+p4*2+p3 784 punpcklbw m8, m13, m3 785 punpckhbw m9, m13, m3 786 pmaddubsw m8, [pb_1] 787 pmaddubsw m9, [pb_1] 788 paddw m10, m8 789 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1 790 punpcklbw m8, m4, m5 791 punpckhbw m9, m4, m5 792 pmaddubsw m8, [pb_1] 793 pmaddubsw m9, [pb_1] 794 paddw m10, m8 795 paddw m11, m9 ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 796 pmulhrsw m8, m10, [pw_2048] 797 pmulhrsw m9, m11, [pw_2048] 798 packuswb m8, m9 799 pand m8, m1 800 pandn m9, m1, m2 801 por m8, m9 802%ifidn %2, v 803 mova [tmpq+strideq*2], m8 ; p5 804%else 805 mova [rsp+13*32], m8 806%endif 807 808 ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B 809 ; write -5 810 pmaddubsw m14, [pb_m1_1] 811 pmaddubsw m15, [pb_m1_1] 812 paddw m10, m14 813 paddw m11, m15 ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0 814 punpcklbw m8, m0, m6 815 punpckhbw m9, m0, m6 816 pmaddubsw m8, [pb_m1_1] 817 pmaddubsw m9, [pb_m1_1] 818 mova [rsp+3*32], m8 819 mova [rsp+4*32], m9 820 paddw m10, m8 821 paddw m11, m9 ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1 822 pmulhrsw m8, m10, [pw_2048] 823 pmulhrsw m9, m11, [pw_2048] 824 packuswb m8, m9 825 vpblendvb m8, m7, m8, m1 826%ifidn %2, v 827 mova [tmpq+stride3q], m8 ; p4 828%else 829 mova [rsp+14*32], m8 830%endif 831 832 ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C 833 ; write -4 834 mova m14, [rsp+1*32] 835 punpcklbw m8, m0, m13 836 punpckhbw m9, m0, m13 837 pmaddubsw m8, [pb_m1_1] 838 pmaddubsw m9, [pb_m1_1] 839 paddw m10, m8 840 paddw m11, m9 ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1 841 punpcklbw m8, m2, m14 842 punpckhbw m2, m14 843 pmaddubsw m8, [pb_m1_1] 844 pmaddubsw m2, [pb_m1_1] 845 mova [rsp+1*32], m8 846 paddw m10, m8 847 paddw m11, m2 ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2 848 pmulhrsw m8, m10, 
[pw_2048] 849 pmulhrsw m9, m11, [pw_2048] 850 packuswb m8, m9 851 vpblendvb m8, m12, m8, m1 852%ifidn %2, v 853 mova [tmpq+strideq*4], m8 ; p3 854%else 855 mova [rsp+19*32], m8 856%endif 857 858 ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D 859 ; write -3 860 mova m15, [rsp+2*32] 861 punpcklbw m8, m0, m3 862 punpckhbw m9, m0, m3 863 pmaddubsw m8, [pb_m1_1] 864 pmaddubsw m9, [pb_m1_1] 865 paddw m10, m8 866 paddw m11, m9 ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2 867 punpcklbw m8, m7, m15 868 punpckhbw m7, m15 869 pmaddubsw m8, [pb_m1_1] 870 pmaddubsw m7, [pb_m1_1] 871 mova [rsp+2*32], m8 872 paddw m10, m8 873 paddw m11, m7 ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3 874 pmulhrsw m8, m10, [pw_2048] 875 pmulhrsw m9, m11, [pw_2048] 876 packuswb m8, m9 877 vpblendvb m8, m13, m8, m1 878 mova [rsp+6*32], m8 ; don't clobber p2/m13 since we need it in F 879 880 ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E 881 ; write -2 882%ifidn %2, v 883 lea tmpq, [dstq+strideq*4] 884%endif 885 punpcklbw m8, m0, m4 886 punpckhbw m9, m0, m4 887 pmaddubsw m8, [pb_m1_1] 888 pmaddubsw m9, [pb_m1_1] 889 paddw m10, m8 890 paddw m11, m9 ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3 891%ifidn %2, v 892 mova m9, [tmpq+strideq*0] ; q4 893%else 894 mova m9, [rsp+15*32] 895%endif 896 punpcklbw m8, m12, m9 897 punpckhbw m9, m12, m9 898 pmaddubsw m8, [pb_m1_1] 899 pmaddubsw m9, [pb_m1_1] 900 mova [rsp+7*32], m8 901 mova [rsp+5*32], m9 902 paddw m10, m8 903 paddw m11, m9 ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4 904 pmulhrsw m8, m10, [pw_2048] 905 pmulhrsw m9, m11, [pw_2048] 906 packuswb m8, m9 907 vpblendvb m8, m3, m8, m1 908 mova [rsp+8*32], m8 ; don't clobber p1/m3 since we need it in G 909 910 ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F 911 ; write -1 912%ifidn %2, v 913 mova m9, [tmpq+strideq*1] ; q5 914%else 915 mova m9, [rsp+16*32] 916%endif 917 punpcklbw m8, m0, m5 918 punpckhbw m0, m5 919 pmaddubsw m8, [pb_m1_1] 920 pmaddubsw m0, [pb_m1_1] 921 paddw m10, m8 922 paddw m11, m0 ; 
p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4 923 punpcklbw m0, m13, m9 924 punpckhbw m9, m13, m9 925 mova m13, [rsp+6*32] 926 pmaddubsw m0, [pb_m1_1] 927 pmaddubsw m9, [pb_m1_1] 928 mova [rsp+ 9*32], m0 929 mova [rsp+10*32], m9 930 paddw m10, m0 931 paddw m11, m9 ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5 932 pmulhrsw m0, m10, [pw_2048] 933 pmulhrsw m8, m11, [pw_2048] 934 packuswb m0, m8 935 vpblendvb m0, m4, m0, m1 936 mova [rsp+6*32], m0 ; don't clobber p0/m4 since we need it in H 937 938 ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G 939 ; write +0 940%ifidn %2, v 941 mova m0, [tmpq+strideq*2] ; q6 942%else 943 mova m0, [rsp+17*32] 944%endif 945 paddw m10, [rsp+3*32] 946 paddw m11, [rsp+4*32] ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5 947 punpcklbw m8, m3, m0 948 punpckhbw m9, m3, m0 949 mova m3, [rsp+8*32] 950 pmaddubsw m8, [pb_m1_1] 951 pmaddubsw m9, [pb_m1_1] 952 mova [rsp+3*32], m8 953 mova [rsp+4*32], m9 954 paddw m10, m8 955 paddw m11, m9 ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6 956 pmulhrsw m8, m10, [pw_2048] 957 pmulhrsw m9, m11, [pw_2048] 958 packuswb m8, m9 959 vpblendvb m8, m5, m8, m1 960 mova [rsp+8*32], m8 ; don't clobber q0/m5 since we need it in I 961 962 ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H 963 ; write +1 964 paddw m10, [rsp+1*32] 965 paddw m11, m2 ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6 966 punpcklbw m8, m4, m0 967 punpckhbw m2, m4, m0 968 mova m4, [rsp+6*32] 969 pmaddubsw m8, [pb_m1_1] 970 pmaddubsw m2, [pb_m1_1] 971 paddw m10, m8 972 paddw m11, m2 ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2 973 pmulhrsw m2, m10, [pw_2048] 974 pmulhrsw m9, m11, [pw_2048] 975 packuswb m2, m9 976 vpblendvb m2, m6, m2, m1 ; don't clobber q1/m6 since we need it in K 977 978 ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I 979 ; write +2 980 paddw m10, [rsp+2*32] 981 paddw m11, m7 ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2 982 punpcklbw m8, m5, m0 983 punpckhbw m9, m5, m0 984 mova m5, [rsp+8*32] 985 
pmaddubsw m8, [pb_m1_1] 986 pmaddubsw m9, [pb_m1_1] 987 paddw m10, m8 988 paddw m11, m9 ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3 989 pmulhrsw m7, m10, [pw_2048] 990 pmulhrsw m9, m11, [pw_2048] 991 packuswb m7, m9 992 vpblendvb m7, m14, m7, m1 ; don't clobber q2/m14 since we need it in K 993 994 ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J 995 ; write +3 996 paddw m10, [rsp+7*32] 997 paddw m11, [rsp+5*32] ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3 998 punpcklbw m8, m6, m0 999 punpckhbw m9, m6, m0 1000 SWAP 2, 6 1001 pmaddubsw m8, [pb_m1_1] 1002 pmaddubsw m9, [pb_m1_1] 1003 paddw m10, m8 1004 paddw m11, m9 ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4 1005 pmulhrsw m8, m10, [pw_2048] 1006 pmulhrsw m9, m11, [pw_2048] 1007 packuswb m8, m9 1008 vpblendvb m8, m15, m8, m1 1009%ifidn %2, v 1010 mova [tmpq+mstrideq], m8 ; q3 1011%else 1012 mova [rsp+20*32], m8 1013%endif 1014 1015 ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K 1016 ; write +4 1017 paddw m10, [rsp+ 9*32] 1018 paddw m11, [rsp+10*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 1019 punpcklbw m8, m14, m0 1020 punpckhbw m9, m14, m0 1021 SWAP 14, 7 1022 pmaddubsw m8, [pb_m1_1] 1023 pmaddubsw m9, [pb_m1_1] 1024 paddw m10, m8 1025 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 1026 pmulhrsw m8, m10, [pw_2048] 1027 pmulhrsw m9, m11, [pw_2048] 1028 packuswb m8, m9 1029%ifidn %2, v 1030 mova m9, [tmpq+strideq*0] 1031%else 1032 mova m9, [rsp+15*32] 1033%endif 1034 vpblendvb m8, m9, m8, m1 1035%ifidn %2, v 1036 mova [tmpq+strideq*0], m8 ; q4 1037%else 1038 mova [rsp+15*32], m8 1039%endif 1040 1041 ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L 1042 ; write +5 1043 paddw m10, [rsp+3*32] 1044 paddw m11, [rsp+4*32] ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4 1045 punpcklbw m8, m15, m0 1046 punpckhbw m9, m15, m0 1047 pmaddubsw m8, [pb_m1_1] 1048 pmaddubsw m9, [pb_m1_1] 1049 paddw m10, m8 1050 paddw m11, m9 ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5 1051 pmulhrsw m10, [pw_2048] 1052 pmulhrsw m11, [pw_2048] 1053 
packuswb m10, m11 1054%ifidn %2, v 1055 mova m11, [tmpq+strideq*1] 1056%else 1057 mova m11, [rsp+16*32] 1058%endif 1059 vpblendvb m10, m11, m10, m1 1060%ifidn %2, v 1061 mova [tmpq+strideq*1], m10 ; q5 1062%else 1063 mova [rsp+16*32], m10 1064%endif 1065 1066 mova m9, [rsp+0*32] 1067%ifidn %2, v 1068 lea tmpq, [dstq+mstrideq*4] 1069%endif 1070%endif 1071%if %1 >= 8 1072 ; flat8 filter 1073 punpcklbw m0, m12, m3 1074 punpckhbw m1, m12, m3 1075 pmaddubsw m2, m0, [pb_3_1] 1076 pmaddubsw m7, m1, [pb_3_1] ; 3 * p3 + p1 1077 punpcklbw m8, m13, m4 1078 punpckhbw m11, m13, m4 1079 pmaddubsw m8, [pb_2_1] 1080 pmaddubsw m11, [pb_2_1] 1081 paddw m2, m8 1082 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 1083 punpcklbw m8, m5, [pb_4] 1084 punpckhbw m11, m5, [pb_4] 1085 pmaddubsw m8, [pb_1] 1086 pmaddubsw m11, [pb_1] 1087 paddw m2, m8 1088 paddw m7, m11 ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4 1089 psrlw m8, m2, 3 1090 psrlw m11, m7, 3 1091 packuswb m8, m11 1092 vpblendvb m10, m13, m8, m9 ; p2 1093%ifidn %2, v 1094 mova [tmpq+strideq*1], m10 ; p2 1095%endif 1096 1097 pmaddubsw m8, m0, [pb_m1_1] 1098 pmaddubsw m11, m1, [pb_m1_1] 1099 paddw m2, m8 1100 paddw m7, m11 1101 punpcklbw m8, m13, m6 1102 punpckhbw m11, m13, m6 1103 pmaddubsw m8, [pb_m1_1] 1104 pmaddubsw m11, [pb_m1_1] 1105 paddw m2, m8 1106 paddw m7, m11 ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4 1107 psrlw m8, m2, 3 1108 psrlw m11, m7, 3 1109 packuswb m8, m11 1110 vpblendvb m8, m3, m8, m9 ; p1 1111%ifidn %2, v 1112 mova [tmpq+strideq*2], m8 ; p1 1113%else 1114 mova [rsp+0*32], m8 1115%endif 1116 1117 pmaddubsw m0, [pb_1] 1118 pmaddubsw m1, [pb_1] 1119 psubw m2, m0 1120 psubw m7, m1 1121 punpcklbw m8, m4, m14 1122 punpckhbw m11, m4, m14 1123 pmaddubsw m8, [pb_1] 1124 pmaddubsw m11, [pb_1] 1125 paddw m2, m8 1126 paddw m7, m11 ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4 1127 psrlw m8, m2, 3 1128 psrlw m11, m7, 3 1129 packuswb m8, m11 1130 vpblendvb m8, m4, m8, m9 ; p0 1131%ifidn %2, v 1132 mova [tmpq+stride3q ], m8 ; p0 1133%else 
1134 mova [rsp+1*32], m8 1135%endif 1136 1137 punpcklbw m0, m5, m15 1138 punpckhbw m1, m5, m15 1139 pmaddubsw m8, m0, [pb_1] 1140 pmaddubsw m11, m1, [pb_1] 1141 paddw m2, m8 1142 paddw m7, m11 1143 punpcklbw m8, m4, m12 1144 punpckhbw m11, m4, m12 1145 pmaddubsw m8, [pb_1] 1146 pmaddubsw m11, [pb_1] 1147 psubw m2, m8 1148 psubw m7, m11 ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4 1149 psrlw m8, m2, 3 1150 psrlw m11, m7, 3 1151 packuswb m8, m11 1152 vpblendvb m11, m5, m8, m9 ; q0 1153%ifidn %2, v 1154 mova [dstq+strideq*0], m11 ; q0 1155%endif 1156 1157 pmaddubsw m0, [pb_m1_1] 1158 pmaddubsw m1, [pb_m1_1] 1159 paddw m2, m0 1160 paddw m7, m1 1161 punpcklbw m8, m13, m6 1162 punpckhbw m13, m6 1163 pmaddubsw m8, [pb_m1_1] 1164 pmaddubsw m13, [pb_m1_1] 1165 paddw m2, m8 1166 paddw m7, m13 ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4 1167 psrlw m8, m2, 3 1168 psrlw m13, m7, 3 1169 packuswb m8, m13 1170 vpblendvb m13, m6, m8, m9 ; q1 1171%ifidn %2, v 1172 mova [dstq+strideq*1], m13 ; q1 1173%endif 1174 1175 punpcklbw m0, m3, m6 1176 punpckhbw m1, m3, m6 1177 pmaddubsw m0, [pb_1] 1178 pmaddubsw m1, [pb_1] 1179 psubw m2, m0 1180 psubw m7, m1 1181 punpcklbw m0, m14, m15 1182 punpckhbw m1, m14, m15 1183 pmaddubsw m0, [pb_1] 1184 pmaddubsw m1, [pb_1] 1185 paddw m2, m0 1186 paddw m7, m1 ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4 1187 psrlw m2, 3 1188 psrlw m7, 3 1189 packuswb m2, m7 1190 vpblendvb m2, m14, m2, m9 ; q2 1191%ifidn %2, v 1192 mova [dstq+strideq*2], m2 ; q2 1193%else 1194 mova m0, [rsp+0*32] 1195 mova m1, [rsp+1*32] 1196%if %1 == 8 1197 ; 16x8 transpose 1198 punpcklbw m3, m12, m10 1199 punpckhbw m12, m10 1200 punpcklbw m10, m0, m1 1201 punpckhbw m0, m1 1202 punpcklbw m1, m11, m13 1203 punpckhbw m11, m13 1204 punpcklbw m13, m2, m15 1205 punpckhbw m2, m15 1206 1207 punpcklwd m15, m3, m10 1208 punpckhwd m3, m10 1209 punpcklwd m10, m12, m0 1210 punpckhwd m12, m0 1211 punpcklwd m0, m1, m13 1212 punpckhwd m1, m13 1213 punpcklwd m13, m11, m2 1214 punpckhwd m11, m2 1215 1216 
punpckldq m2, m15, m0 1217 punpckhdq m15, m0 1218 punpckldq m0, m3, m1 1219 punpckhdq m3, m1 1220 punpckldq m1, m10, m13 1221 punpckhdq m10, m13 1222 punpckldq m13, m12, m11 1223 punpckhdq m12, m11 1224 1225 ; write 8x32 1226 movq [dstq+strideq*0-4], xm2 1227 movhps [dstq+strideq*1-4], xm2 1228 movq [dstq+strideq*2-4], xm15 1229 movhps [dstq+stride3q -4], xm15 1230 lea dstq, [dstq+strideq*4] 1231 movq [dstq+strideq*0-4], xm0 1232 movhps [dstq+strideq*1-4], xm0 1233 movq [dstq+strideq*2-4], xm3 1234 movhps [dstq+stride3q -4], xm3 1235 lea dstq, [dstq+strideq*4] 1236 movq [dstq+strideq*0-4], xm1 1237 movhps [dstq+strideq*1-4], xm1 1238 movq [dstq+strideq*2-4], xm10 1239 movhps [dstq+stride3q -4], xm10 1240 lea dstq, [dstq+strideq*4] 1241 movq [dstq+strideq*0-4], xm13 1242 movhps [dstq+strideq*1-4], xm13 1243 movq [dstq+strideq*2-4], xm12 1244 movhps [dstq+stride3q -4], xm12 1245 lea dstq, [dstq+strideq*4] 1246 1247 vextracti128 xm2, m2, 1 1248 vextracti128 xm15, m15, 1 1249 vextracti128 xm0, m0, 1 1250 vextracti128 xm3, m3, 1 1251 vextracti128 xm1, m1, 1 1252 vextracti128 xm10, m10, 1 1253 vextracti128 xm13, m13, 1 1254 vextracti128 xm12, m12, 1 1255 1256 movq [dstq+strideq*0-4], xm2 1257 movhps [dstq+strideq*1-4], xm2 1258 movq [dstq+strideq*2-4], xm15 1259 movhps [dstq+stride3q -4], xm15 1260 lea dstq, [dstq+strideq*4] 1261 movq [dstq+strideq*0-4], xm0 1262 movhps [dstq+strideq*1-4], xm0 1263 movq [dstq+strideq*2-4], xm3 1264 movhps [dstq+stride3q -4], xm3 1265 lea dstq, [dstq+strideq*4] 1266 movq [dstq+strideq*0-4], xm1 1267 movhps [dstq+strideq*1-4], xm1 1268 movq [dstq+strideq*2-4], xm10 1269 movhps [dstq+stride3q -4], xm10 1270 lea dstq, [dstq+strideq*4] 1271 movq [dstq+strideq*0-4], xm13 1272 movhps [dstq+strideq*1-4], xm13 1273 movq [dstq+strideq*2-4], xm12 1274 movhps [dstq+stride3q -4], xm12 1275 lea dstq, [dstq+strideq*4] 1276%else 1277 ; 16x16 transpose and store 1278 SWAP 5, 10, 2 1279 SWAP 6, 0 1280 SWAP 7, 1 1281 SWAP 8, 11 1282 SWAP 9, 13 1283 mova 
m0, [rsp+11*32] 1284 mova m1, [rsp+12*32] 1285 mova m2, [rsp+13*32] 1286 mova m3, [rsp+14*32] 1287 mova m4, [rsp+19*32] 1288 mova m11, [rsp+20*32] 1289 mova m12, [rsp+15*32] 1290 mova m13, [rsp+16*32] 1291 mova m14, [rsp+17*32] 1292 TRANSPOSE_16X16B 1, 0, [rsp+18*32] 1293 movu [dstq+strideq*0-8], xm0 1294 movu [dstq+strideq*1-8], xm1 1295 movu [dstq+strideq*2-8], xm2 1296 movu [dstq+stride3q -8], xm3 1297 lea dstq, [dstq+strideq*4] 1298 movu [dstq+strideq*0-8], xm4 1299 movu [dstq+strideq*1-8], xm5 1300 movu [dstq+strideq*2-8], xm6 1301 movu [dstq+stride3q -8], xm7 1302 lea dstq, [dstq+strideq*4] 1303 movu [dstq+strideq*0-8], xm8 1304 movu [dstq+strideq*1-8], xm9 1305 movu [dstq+strideq*2-8], xm10 1306 movu [dstq+stride3q -8], xm11 1307 lea dstq, [dstq+strideq*4] 1308 movu [dstq+strideq*0-8], xm12 1309 movu [dstq+strideq*1-8], xm13 1310 movu [dstq+strideq*2-8], xm14 1311 movu [dstq+stride3q -8], xm15 1312 lea dstq, [dstq+strideq*4] 1313 vextracti128 [dstq+strideq*0-8], m0, 1 1314 vextracti128 [dstq+strideq*1-8], m1, 1 1315 vextracti128 [dstq+strideq*2-8], m2, 1 1316 vextracti128 [dstq+stride3q -8], m3, 1 1317 lea dstq, [dstq+strideq*4] 1318 vextracti128 [dstq+strideq*0-8], m4, 1 1319 vextracti128 [dstq+strideq*1-8], m5, 1 1320 vextracti128 [dstq+strideq*2-8], m6, 1 1321 vextracti128 [dstq+stride3q -8], m7, 1 1322 lea dstq, [dstq+strideq*4] 1323 vextracti128 [dstq+strideq*0-8], m8, 1 1324 vextracti128 [dstq+strideq*1-8], m9, 1 1325 vextracti128 [dstq+strideq*2-8], m10, 1 1326 vextracti128 [dstq+stride3q -8], m11, 1 1327 lea dstq, [dstq+strideq*4] 1328 vextracti128 [dstq+strideq*0-8], m12, 1 1329 vextracti128 [dstq+strideq*1-8], m13, 1 1330 vextracti128 [dstq+strideq*2-8], m14, 1 1331 vextracti128 [dstq+stride3q -8], m15, 1 1332 lea dstq, [dstq+strideq*4] 1333%endif 1334%endif 1335%elif %1 == 6 1336 ; flat6 filter 1337 1338 punpcklbw m8, m13, m5 1339 punpckhbw m11, m13, m5 1340 pmaddubsw m0, m8, [pb_3_1] 1341 pmaddubsw m1, m11, [pb_3_1] 1342 punpcklbw m7, m4, m3 1343 
    ; ---- tail of %macro FILTER (the macro begins above this chunk) ----
    ; %1 == 6 path, continued: flat6 filter (used for chroma).
    ; Register/pixel mapping per the stores below: m13=p2, m3=p1, m4=p0,
    ; m5=q0, m6=q1, m14=q2; m9 = flatness mask, m0/m1 = lo/hi 16-bit
    ; accumulators.  pmulhrsw with pw_4096 computes (x + 4) >> 3 per lane,
    ; i.e. the flat-filter rounding shift.
    ; m7/m8 were set up above: m7/m10 = p0|p1 byte pairs, m8/m11 = p2|q0 pairs,
    ; and m0/m1 already hold 3*p2 + q0 (pmaddubsw with pb_3_1).
    punpckhbw           m10, m4, m3
    pmaddubsw            m2, m7, [pb_2]          ; 2*p0 + 2*p1
    pmaddubsw           m12, m10, [pb_2]
    paddw                m0, m2
    paddw                m1, m12                 ; 3*p2 + 2*p1 + 2*p0 + q0
    pmulhrsw             m2, m0, [pw_4096]       ; (sum + 4) >> 3
    pmulhrsw            m12, m1, [pw_4096]
    packuswb             m2, m12
    vpblendvb            m2, m3, m2, m9          ; keep original p1 where not flat
%ifidn %2, v
    mova [tmpq+strideq*2], m2 ; p1
%endif

    pmaddubsw            m8, [pb_m1_1]           ; q0 - p2  (m8 held p2|q0 pairs)
    pmaddubsw           m11, [pb_m1_1]
    paddw                m0, m8
    paddw                m1, m11
    punpcklbw            m8, m13, m6             ; p2|q1 pairs
    punpckhbw           m11, m13, m6
    pmaddubsw            m8, [pb_m1_1]           ; q1 - p2
    pmaddubsw           m11, [pb_m1_1]
    paddw                m0, m8
    paddw                m1, m11                 ; p2 + 2*p1 + 2*p0 + 2*q0 + q1
    pmulhrsw            m12, m0, [pw_4096]
    pmulhrsw            m13, m1, [pw_4096]
    packuswb            m12, m13
    vpblendvb           m12, m4, m12, m9         ; blend with original p0
%ifidn %2, v
    mova [tmpq+stride3q], m12 ; p0
%endif

    paddw                m0, m8                  ; add (q1 - p2) a second time
    paddw                m1, m11                 ; 2*p1 + 2*p0 + 2*q0 + 2*q1
    punpcklbw            m8, m3, m14             ; p1|q2 pairs
    punpckhbw           m11, m3, m14
    pmaddubsw           m14, m8, [pb_m1_1]       ; q2 - p1
    pmaddubsw           m13, m11, [pb_m1_1]
    paddw                m0, m14
    paddw                m1, m13                 ; p1 + 2*p0 + 2*q0 + 2*q1 + q2
    pmulhrsw            m14, m0, [pw_4096]
    pmulhrsw            m13, m1, [pw_4096]
    packuswb            m14, m13
    vpblendvb           m14, m5, m14, m9         ; blend with original q0
%ifidn %2, v
    mova [dstq+strideq*0], m14 ; q0
%endif

    pmaddubsw            m8, [pb_m1_2]           ; 2*q2 - p1  (m8 = p1|q2 pairs)
    pmaddubsw           m11, [pb_m1_2]
    paddw                m0, m8
    paddw                m1, m11
    pmaddubsw            m7, [pb_m1_0]           ; -p0  (m7/m10 = p0|p1 pairs)
    pmaddubsw           m10, [pb_m1_0]
    paddw                m0, m7
    paddw                m1, m10                 ; p0 + 2*q0 + 2*q1 + 3*q2
    pmulhrsw             m0, [pw_4096]
    pmulhrsw             m1, [pw_4096]
    packuswb             m0, m1
    vpblendvb            m0, m6, m0, m9          ; blend with original q1
%ifidn %2, v
    mova [dstq+strideq*1], m0 ; q1
%else
    ; horizontal variant: transpose the four filtered columns back into
    ; pixel rows and write them 4 bytes wide
    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
%endif
%else
    ; %1 == 4: no flat filter ran; m3..m6 hold the filtered p1/p0/q0/q1
    ; (per the store comments) - just store or transpose-and-store them
%ifidn %2, v
    mova [tmpq+strideq*0], m3 ; p1
    mova [tmpq+strideq*1], m4 ; p0
    mova [tmpq+strideq*2], m5 ; q0
    mova [tmpq+stride3q ], m6 ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
%endif
%endif
%endmacro

INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_v_sb_y_8bpc: vertical (horizontal-edge) luma loop filter for one
; superblock row.  Args (per cglobal): dst, stride, mask, l, l_stride,
; lut, w.  7 args, 10 GPRs, 16 SIMD regs, 32*11 bytes of stack scratch.
; Each iteration handles 32 pixels (dst += 32, w -= 8, so w is presumably
; in 4-pixel units); the mask bytes select the widest applicable filter:
; vmask[2] -> 16-wide flat, vmask[1] -> 8-wide flat, vmask[0] -> 4-tap.
;-----------------------------------------------------------------------
cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
                         dst, stride, mask, l, l_stride, lut, \
                         w, stride3, mstride, tmp
    shl           l_strideq, 2                  ; l_stride *= 4 (presumably 4-byte l entries - confirm against caller)
    sub                  lq, l_strideq          ; point l at the row above the edge
    mov            mstrideq, strideq
    neg            mstrideq                     ; mstride = -stride
    lea            stride3q, [strideq*3]

.loop:
    cmp byte [maskq+8], 0                       ; vmask[2]
    je .no_flat16

    FILTER 16, v
    jmp .end

.no_flat16:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER 8, v
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .end

    call .v4                                    ; narrow 4-tap filter (shared with uv below)

.end:
    add                  lq, 32
    add                dstq, 32
    add               maskq, 1
    sub                  wd, 8
    jg .loop
    RET
ALIGN function_align
.v4:                                            ; callable narrow-filter entry, reused by lpf_v_sb_uv
    FILTER 4, v
    ret

INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_h_sb_y_8bpc: horizontal (vertical-edge) luma loop filter.
; Args: dst, stride, mask, l, l_stride, lut, h.  32*21 bytes of stack
; scratch (the h variants need extra transpose storage).  Same mask
; dispatch as the v variant; h counts rows (h -= 8 per 32-row block).
;-----------------------------------------------------------------------
cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
                         dst, stride, mask, l, l_stride, lut, \
                         h, stride3, l_stride3, tmp
    shl           l_strideq, 2                  ; l_stride *= 4
    sub                  lq, 4                  ; point l at the column left of the edge
    lea            stride3q, [strideq*3]
    lea          l_stride3q, [l_strideq*3]

.loop:
    cmp byte [maskq+8], 0                       ; vmask[2]
    je .no_flat16

    FILTER 16, h
    jmp .end

.no_flat16:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER 8, h
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .no_filter

    call .h4
    jmp .end

.no_filter:
    ; nothing to filter: FILTER would have advanced dst itself, so step
    ; past the block manually - dst += 32 rows (stride3*8 + stride*8),
    ; l += 8 l-rows
    lea                dstq, [dstq+stride3q*8]
    lea                  lq, [lq+l_strideq*8]
    lea                dstq, [dstq+strideq*8]
.end:
    add               maskq, 1
    sub                  hd, 8
    jg .loop
    RET
ALIGN function_align
.h4:                                            ; callable narrow-filter entry, reused by lpf_h_sb_uv
    FILTER 4, h
    ret

INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_v_sb_uv_8bpc: vertical chroma loop filter.  No stack scratch;
; chroma has no 16-wide filter, so the dispatch is vmask[1] -> 6-tap
; flat, vmask[0] -> 4-tap (reusing the luma .v4 entry via mangle()).
;-----------------------------------------------------------------------
cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
                          dst, stride, mask, l, l_stride, lut, \
                          w, stride3, mstride, tmp
    shl           l_strideq, 2                  ; l_stride *= 4
    sub                  lq, l_strideq          ; point l at the row above the edge
    mov            mstrideq, strideq
    neg            mstrideq                     ; mstride = -stride
    lea            stride3q, [strideq*3]

.loop:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER 6, v
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .end

    call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4

.end:
    add                  lq, 32
    add                dstq, 32
    add               maskq, 1
    sub                  wd, 8
    jg .loop
    RET

INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_h_sb_uv_8bpc: horizontal chroma loop filter.  Mirrors
; lpf_h_sb_y_8bpc but with the 6-tap flat filter and the shared luma
; .h4 narrow-filter entry.
;-----------------------------------------------------------------------
cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
                          dst, stride, mask, l, l_stride, lut, \
                          h, stride3, l_stride3, tmp
    shl           l_strideq, 2                  ; l_stride *= 4
    sub                  lq, 4                  ; point l at the column left of the edge
    lea            stride3q, [strideq*3]
    lea          l_stride3q, [l_strideq*3]

.loop:
    cmp byte [maskq+4], 0                       ; vmask[1]
    je .no_flat

    FILTER 6, h
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]
    je .no_filter

    call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4
    jmp .end

.no_filter:
    ; skip an unfiltered block: dst += 32 rows, l += 8 l-rows
    lea                dstq, [dstq+stride3q*8]
    lea                  lq, [lq+l_strideq*8]
    lea                dstq, [dstq+strideq*8]
.end:
    add               maskq, 1
    sub                  hd, 8
    jg .loop
    RET

%endif ; ARCH_X86_64