; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; NASM syntax.  SSSE3 motion-compensation kernels for 16 bpc (high bit
; depth) pixels, built for both 32- and 64-bit x86.  Calling convention,
; register naming (r0..rN, q/d/b suffixes) and stack handling come from
; the cglobal machinery in ext/x86/x86inc.asm.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; dav1d_obmc_masks[] << 9
obmc_masks:     dw     0,     0,  9728,     0, 12800,  7168,  2560,     0
                dw 14336, 11264,  8192,  5632,  3584,  1536,     0,     0
                dw 15360, 13824, 12288, 10752,  9216,  7680,  6144,  5120
                dw  4096,  3072,  2048,  1536,     0,     0,     0,     0
                dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
                dw  9728,  8704,  8192,  7168,  6656,  6144,  5632,  4608
                dw  4096,  3584,  3072,  2560,  2048,  2048,  1536,  1024

; pshufb control vectors (byte indices) used by the filter loops below
blend_shuf:     db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
spel_h_shufA:   db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
spel_h_shufB:   db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
spel_h_shuf2:   db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
spel_s_shuf2:   db  0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
spel_s_shuf8:   db  0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
unpckw:         db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
rescale_mul:    dd  0,  1,  2,  3
resize_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
bdct_lb_q:      times 8 db 0
                times 8 db 4
                times 8 db 8
                times 8 db 12

; Rounding / clamping constants.  Several tables hold two variants back to
; back (10-bit first, then 12-bit) and are indexed by (bitdepth_max >> 11)*8
; at the load sites below.
pw_2:           times 8 dw 2
pw_16:          times 4 dw 16
prep_mul:       times 4 dw 16
                times 8 dw 4
pw_64:          times 8 dw 64
pw_256:         times 8 dw 256
pw_2048:        times 4 dw 2048
bidir_mul:      times 4 dw 2048
pw_8192:        times 8 dw 8192
pw_27615:       times 8 dw 27615
pw_32766:       times 8 dw 32766
pw_m512:        times 8 dw -512
pd_63:          times 4 dd 63
pd_64:          times 4 dd 64
pd_512:         times 4 dd 512
pd_m524256:     times 4 dd -524256 ; -8192 << 6 + 32
pd_0x3ff:       times 4 dd 0x3ff
pd_0x4000:      times 4 dd 0x4000
pq_0x400000:    times 2 dq 0x400000
pq_0x40000000:  times 2 dq 0x40000000
pd_65538:       times 2 dd 65538

put_bilin_h_rnd:   times 4 dw 8
                   times 4 dw 10
s_8tap_h_rnd:      times 2 dd 2
                   times 2 dd 8
put_s_8tap_v_rnd:  times 2 dd 512
                   times 2 dd 128
s_8tap_h_sh:       dd 2, 4
put_s_8tap_v_sh:   dd 10, 8
bidir_rnd:         times 4 dw -16400
                   times 4 dw -16388
put_8tap_h_rnd:    dd 34, 34, 40, 40
prep_8tap_1d_rnd:  times 2 dd 8 - (8192 << 4)
prep_8tap_2d_rnd:  times 4 dd 32 - (8192 << 5)

warp8x8_shift:     dd 11, 13
warp8x8_rnd1:      dd 1024, 1024, 4096, 4096
warp8x8_rnd2:      times 4 dw 4096
                   times 4 dw 16384
warp8x8t_rnd:      times 2 dd 16384 - (8192 << 15)

; Emit a dword jump table of .w4/.w8/... label offsets for a bidir
; (two-source) function; the extra args are the supported block widths.
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro

BIDIR_JMP_TABLE avg,        ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3, 4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 32, 64, 128

; Word jump table of _wN label offsets relative to a caller-supplied base
; label (%2); used for the put/prep copy paths.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro

%xdefine put_ssse3  mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

; Three word jump tables (.wN / .dy1_wN / .dy2_wN) for the scaled-MC
; entry points; dy1/dy2 are the specialized vertical-step variants.
%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro

SCALED_JMP_TABLE put_8tap_scaled,  ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,    4, 8, 16, 32, 64, 128

cextern mc_subpel_filters
; -8 bias: filter indices computed below are 1-based per 8-byte entry
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)

cextern mc_warp_filter
cextern resize_filter

SECTION .text

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

;------------------------------------------------------------------------------
; put_bilin_16bpc(dst, ds, src, ss, w, h, mx, my, bitdepth_max)
; mx = r6m, my = r7m, bitdepth_max = r8m (see inline comments).
; mx == my == 0 is a straight copy (.put, dispatched via put_ssse3_table);
; otherwise horizontal (.h), vertical (.v) or separable 2-D (.hv) bilinear.
;------------------------------------------------------------------------------
INIT_XMM ssse3
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
%define base t0-put_ssse3
    mov        mxyd, r6m ; mx
    LEA        t0, put_ssse3
    movifnidn  wd, wm
    test       mxyd, mxyd
    jnz        .h
    mov        mxyd, r7m ; my
    test       mxyd, mxyd
    jnz        .v
.put:
    tzcnt      wd, wd
    movzx      wd, word [base+put_ssse3_table+wq*2]
    add        wq, t0
    movifnidn  hd, hm
    jmp        wq
.put_w2:
    mov        r4d, [srcq+ssq*0]
    mov        r6d, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .put_w2
    RET
.put_w4:
    movq       m0, [srcq+ssq*0]
    movq       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .put_w4
    RET
.put_w8:
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .put_w8
    RET
.put_w16:
    movu       m0, [srcq+ssq*0+16*0]
    movu       m1, [srcq+ssq*0+16*1]
    movu       m2, [srcq+ssq*1+16*0]
    movu       m3, [srcq+ssq*1+16*1]
    lea        srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0+16*0], m0
    mova       [dstq+dsq*0+16*1], m1
    mova       [dstq+dsq*1+16*0], m2
    mova       [dstq+dsq*1+16*1], m3
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .put_w16
    RET
.put_w32:
    movu       m0, [srcq+16*0]
    movu       m1, [srcq+16*1]
    movu       m2, [srcq+16*2]
    movu       m3, [srcq+16*3]
    add        srcq, ssq
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    mova       [dstq+16*2], m2
    mova       [dstq+16*3], m3
    add        dstq, dsq
    dec        hd
    jg         .put_w32
    RET
.put_w64:
    movu       m0, [srcq+16*0]
    movu       m1, [srcq+16*1]
    movu       m2, [srcq+16*2]
    movu       m3, [srcq+16*3]
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    mova       [dstq+16*2], m2
    mova       [dstq+16*3], m3
    movu       m0, [srcq+16*4]
    movu       m1, [srcq+16*5]
    movu       m2, [srcq+16*6]
    movu       m3, [srcq+16*7]
    add        srcq, ssq
    mova       [dstq+16*4], m0
    mova       [dstq+16*5], m1
    mova       [dstq+16*6], m2
    mova       [dstq+16*7], m3
    add        dstq, dsq
    dec        hd
    jg         .put_w64
    RET
.put_w128:
    ; bias the pointers by +16*8 so all 16 offsets fit in signed disp8/disp32
    add        srcq, 16*8
    add        dstq, 16*8
.put_w128_loop:
    movu       m0, [srcq-16*8]
    movu       m1, [srcq-16*7]
    movu       m2, [srcq-16*6]
    movu       m3, [srcq-16*5]
    mova       [dstq-16*8], m0
    mova       [dstq-16*7], m1
    mova       [dstq-16*6], m2
    mova       [dstq-16*5], m3
    movu       m0, [srcq-16*4]
    movu       m1, [srcq-16*3]
    movu       m2, [srcq-16*2]
    movu       m3, [srcq-16*1]
    mova       [dstq-16*4], m0
    mova       [dstq-16*3], m1
    mova       [dstq-16*2], m2
    mova       [dstq-16*1], m3
    movu       m0, [srcq+16*0]
    movu       m1, [srcq+16*1]
    movu       m2, [srcq+16*2]
    movu       m3, [srcq+16*3]
    mova       [dstq+16*0], m0
    mova       [dstq+16*1], m1
    mova       [dstq+16*2], m2
    mova       [dstq+16*3], m3
    movu       m0, [srcq+16*4]
    movu       m1, [srcq+16*5]
    movu       m2, [srcq+16*6]
    movu       m3, [srcq+16*7]
    add        srcq, ssq
    mova       [dstq+16*4], m0
    mova       [dstq+16*5], m1
    mova       [dstq+16*6], m2
    mova       [dstq+16*7], m3
    add        dstq, dsq
    dec        hd
    jg         .put_w128_loop
    RET
.h:
    ; horizontal blend: m5 = mx broadcast, m4 = 16-mx
    movd       m5, mxyd
    mov        mxyd, r7m ; my
    mova       m4, [base+pw_16]
    pshufb     m5, [base+pw_256]
    psubw      m4, m5
    test       mxyd, mxyd
    jnz        .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    mov        r6d, r8m ; bitdepth_max
    shr        r6d, 11
    movddup    m3, [base+put_bilin_h_rnd+r6*8]
    movifnidn  hd, hm
    sub        wd, 8
    jg         .h_w16
    je         .h_w8
    cmp        wd, -4
    je         .h_w4
.h_w2:
    movq       m1, [srcq+ssq*0]
    movhps     m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pmullw     m0, m4, m1
    psrlq      m1, 16       ; shifted copy = the "mx" taps
    pmullw     m1, m5
    paddw      m0, m3
    paddw      m0, m1
    psrlw      m0, 4
    movd       [dstq+dsq*0], m0
    punpckhqdq m0, m0
    movd       [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .h_w2
    RET
.h_w4:
    movq       m0, [srcq+ssq*0]
    movhps     m0, [srcq+ssq*1]
    movq       m1, [srcq+ssq*0+2]
    movhps     m1, [srcq+ssq*1+2]
    lea        srcq, [srcq+ssq*2]
    pmullw     m0, m4
    pmullw     m1, m5
    paddw      m0, m3
    paddw      m0, m1
    psrlw      m0, 4
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .h_w4
    RET
.h_w8:
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*0+2]
    pmullw     m0, m4
    pmullw     m1, m5
    paddw      m0, m3
    paddw      m0, m1
    movu       m1, [srcq+ssq*1]
    movu       m2, [srcq+ssq*1+2]
    lea        srcq, [srcq+ssq*2]
    pmullw     m1, m4
    pmullw     m2, m5
    paddw      m1, m3
    paddw      m1, m2
    psrlw      m0, 4
    psrlw      m1, 4
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .h_w8
    RET
.h_w16:
    ; iterate with a negative offset r6 counting up to 0 across the row
    lea        srcq, [srcq+wq*2]
    lea        dstq, [dstq+wq*2]
    neg        wq
.h_w16_loop0:
    mov        r6, wq
.h_w16_loop:
    movu       m0, [srcq+r6*2+ 0]
    movu       m1, [srcq+r6*2+ 2]
    pmullw     m0, m4
    pmullw     m1, m5
    paddw      m0, m3
    paddw      m0, m1
    movu       m1, [srcq+r6*2+16]
    movu       m2, [srcq+r6*2+18]
    pmullw     m1, m4
    pmullw     m2, m5
    paddw      m1, m3
    paddw      m1, m2
    psrlw      m0, 4
    psrlw      m1, 4
    mova       [dstq+r6*2+16*0], m0
    mova       [dstq+r6*2+16*1], m1
    add        r6, 16
    jl         .h_w16_loop
    add        srcq, ssq
    add        dstq, dsq
    dec        hd
    jg         .h_w16_loop0
    RET
.v:
    ; vertical blend via pmulhrsw: m5 = my << 11 broadcast
    shl        mxyd, 11
    movd       m5, mxyd
    pshufb     m5, [base+pw_256]
    movifnidn  hd, hm
    cmp        wd, 4
    jg         .v_w8
    je         .v_w4
.v_w2:
    movd       m0, [srcq+ssq*0]
.v_w2_loop:
    movd       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    punpcklqdq m2, m0, m1
    movd       m0, [srcq+ssq*0]
    punpcklqdq m1, m0
    psubw      m1, m2
    pmulhrsw   m1, m5
    paddw      m1, m2
    movd       [dstq+dsq*0], m1
    punpckhqdq m1, m1
    movd       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .v_w2_loop
    RET
.v_w4:
    movq       m0, [srcq+ssq*0]
.v_w4_loop:
    movq       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    punpcklqdq m2, m0, m1
    movq       m0, [srcq+ssq*0]
    punpcklqdq m1, m0
    psubw      m1, m2
    pmulhrsw   m1, m5
    paddw      m1, m2
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .v_w4_loop
    RET
.v_w8:
%if ARCH_X86_64
%if WIN64
    push       r7
%endif
    ; pack column counter (w*32, high bits) and row count (h, low byte) in r6d
    shl        wd, 5
    mov        r7, srcq
    lea        r6d, [wq+hq-256]
    mov        r4, dstq
%else
    mov        r6, srcq
%endif
.v_w8_loop0:
    movu       m0, [srcq+ssq*0]
.v_w8_loop:
    movu       m3, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    psubw      m1, m3, m0
    pmulhrsw   m1, m5
    paddw      m1, m0
    movu       m0, [srcq+ssq*0]
    psubw      m2, m0, m3
    pmulhrsw   m2, m5
    paddw      m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .v_w8_loop
%if ARCH_X86_64
    add        r7, 16
    add        r4, 16
    movzx      hd, r6b
    mov        srcq, r7
    mov        dstq, r4
    sub        r6d, 1<<8
%else
    mov        dstq, dstmp
    add        r6, 16
    mov        hd, hm
    add        dstq, 16
    mov        srcq, r6
    mov        dstmp, dstq
    sub        wd, 8
%endif
    jg         .v_w8_loop0
%if WIN64
    pop        r7
%endif
    RET
.hv:
    ; 2-D: horizontal pass keeps 2 extra fractional bits (psrlw 2), the
    ; vertical pass then rounds with pmulhrsw against m7 (8192 or 2048).
    WIN64_SPILL_XMM 8
    shl        mxyd, 11
    mova       m3, [base+pw_2]
    movd       m6, mxyd
    mova       m7, [base+pw_8192]
    pshufb     m6, [base+pw_256]
    test       dword r8m, 0x800
    jnz        .hv_12bpc
    psllw      m4, 2
    psllw      m5, 2
    mova       m7, [base+pw_2048]
.hv_12bpc:
    movifnidn  hd, hm
    cmp        wd, 4
    jg         .hv_w8
    je         .hv_w4
.hv_w2:
    movddup    m0, [srcq+ssq*0]
    pshufhw    m1, m0, q0321
    pmullw     m0, m4
    pmullw     m1, m5
    paddw      m0, m3
    paddw      m0, m1
    psrlw      m0, 2
.hv_w2_loop:
    movq       m2, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    movhps     m2, [srcq+ssq*0]
    pmullw     m1, m4, m2
    psrlq      m2, 16
    pmullw     m2, m5
    paddw      m1, m3
    paddw      m1, m2
    psrlw      m1, 2            ; 1 _ 2 _
    shufpd     m2, m0, m1, 0x01 ; 0 _ 1 _
    mova       m0, m1
    psubw      m1, m2
    paddw      m1, m1
    pmulhw     m1, m6
    paddw      m1, m2
    pmulhrsw   m1, m7
    movd       [dstq+dsq*0], m1
    punpckhqdq m1, m1
    movd       [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .hv_w2_loop
    RET
.hv_w4:
    movddup    m0, [srcq+ssq*0]
    movddup    m1, [srcq+ssq*0+2]
    pmullw     m0, m4
    pmullw     m1, m5
    paddw      m0, m3
    paddw      m0, m1
    psrlw      m0, 2
.hv_w4_loop:
    movq       m1, [srcq+ssq*1]
    movq       m2, [srcq+ssq*1+2]
    lea        srcq, [srcq+ssq*2]
    movhps     m1, [srcq+ssq*0]
    movhps     m2, [srcq+ssq*0+2]
    pmullw     m1, m4
    pmullw     m2, m5
    paddw      m1, m3
    paddw      m1, m2
    psrlw      m1, 2            ; 1 2
    shufpd     m2, m0, m1, 0x01 ; 0 1
    mova       m0, m1
    psubw      m1, m2
    paddw      m1, m1
    pmulhw     m1, m6
    paddw      m1, m2
    pmulhrsw   m1, m7
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .hv_w4_loop
    RET
.hv_w8:
%if ARCH_X86_64
%if WIN64
    push       r7
%endif
    shl        wd, 5
    lea        r6d, [wq+hq-256]
    mov        r4, srcq
    mov        r7, dstq
%else
    mov        r6, srcq
%endif
.hv_w8_loop0:
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*0+2]
    pmullw     m0, m4
    pmullw     m1, m5
    paddw      m0, m3
    paddw      m0, m1
    psrlw      m0, 2
.hv_w8_loop:
    movu       m1, [srcq+ssq*1]
    movu       m2, [srcq+ssq*1+2]
    lea        srcq, [srcq+ssq*2]
    pmullw     m1, m4
    pmullw     m2, m5
    paddw      m1, m3
    paddw      m1, m2
    psrlw      m1, 2
    psubw      m2, m1, m0
    paddw      m2, m2
    pmulhw     m2, m6
    paddw      m2, m0
    pmulhrsw   m2, m7
    mova       [dstq+dsq*0], m2
    movu       m0, [srcq+ssq*0]
    movu       m2, [srcq+ssq*0+2]
    pmullw     m0, m4
    pmullw     m2, m5
    paddw      m0, m3
    paddw      m0, m2
    psrlw      m0, 2
    psubw      m2, m0, m1
    paddw      m2, m2
    pmulhw     m2, m6
    paddw      m2, m1
    pmulhrsw   m2, m7
    mova       [dstq+dsq*1], m2
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .hv_w8_loop
%if ARCH_X86_64
    add        r4, 16
    add        r7, 16
    movzx      hd, r6b
    mov        srcq, r4
    mov        dstq, r7
    sub        r6d, 1<<8
%else
    mov        dstq, dstmp
    add        r6, 16
    mov        hd, hm
    add        dstq, 16
    mov        srcq, r6
    mov        dstmp, dstq
    sub        wd, 8
%endif
    jg         .hv_w8_loop0
%if WIN64
    pop        r7
%endif
    RET

;------------------------------------------------------------------------------
; prep_bilin_16bpc(tmp, src, stride, w, h, mx, my, bitdepth_max)
; mx = r5m, my = r6m, bitdepth_max = r7m.  Like put_bilin but writes
; intermediate (pw_8192-biased) coefficients to a packed tmp buffer
; instead of clamped pixels.
;------------------------------------------------------------------------------
cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
%define base r6-prep_ssse3
    movifnidn  mxyd, r5m ; mx
    LEA        r6, prep_ssse3
    movifnidn  hd, hm
    test       mxyd, mxyd
    jnz        .h
    mov        mxyd, r6m ; my
    test       mxyd, mxyd
    jnz        .v
.prep:
    tzcnt      wd, wd
    movzx      wd, word [base+prep_ssse3_table+wq*2]
    mov        r5d, r7m ; bitdepth_max
    mova       m5, [base+pw_8192]
    add        wq, r6
    shr        r5d, 11
    movddup    m4, [base+prep_mul+r5*8]
    lea        stride3q, [strideq*3]
    jmp        wq
.prep_w4:
    movq       m0, [srcq+strideq*0]
    movhps     m0, [srcq+strideq*1]
    movq       m1, [srcq+strideq*2]
    movhps     m1, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    pmullw     m0, m4
    pmullw     m1, m4
    psubw      m0, m5
    psubw      m1, m5
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    add        tmpq, 16*2
    sub        hd, 4
    jg         .prep_w4
    RET
.prep_w8:
    movu       m0, [srcq+strideq*0]
    movu       m1, [srcq+strideq*1]
    movu       m2, [srcq+strideq*2]
    movu       m3, [srcq+stride3q ]
    lea        srcq, [srcq+strideq*4]
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    sub        hd, 4
    jg         .prep_w8
    RET
.prep_w16:
    movu       m0, [srcq+strideq*0+16*0]
    movu       m1, [srcq+strideq*0+16*1]
    movu       m2, [srcq+strideq*1+16*0]
    movu       m3, [srcq+strideq*1+16*1]
    lea        srcq, [srcq+strideq*2]
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    sub        hd, 2
    jg         .prep_w16
    RET
.prep_w32:
    movu       m0, [srcq+16*0]
    movu       m1, [srcq+16*1]
    movu       m2, [srcq+16*2]
    movu       m3, [srcq+16*3]
    add        srcq, strideq
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    add        tmpq, 16*4
    dec        hd
    jg         .prep_w32
    RET
.prep_w64:
    movu       m0, [srcq+16*0]
    movu       m1, [srcq+16*1]
    movu       m2, [srcq+16*2]
    movu       m3, [srcq+16*3]
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    movu       m0, [srcq+16*4]
    movu       m1, [srcq+16*5]
    movu       m2, [srcq+16*6]
    movu       m3, [srcq+16*7]
    add        srcq, strideq
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*4], m0
    mova       [tmpq+16*5], m1
    mova       [tmpq+16*6], m2
    mova       [tmpq+16*7], m3
    add        tmpq, 16*8
    dec        hd
    jg         .prep_w64
    RET
.prep_w128:
    movu       m0, [srcq+16* 0]
    movu       m1, [srcq+16* 1]
    movu       m2, [srcq+16* 2]
    movu       m3, [srcq+16* 3]
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    mova       [tmpq+16*2], m2
    mova       [tmpq+16*3], m3
    movu       m0, [srcq+16* 4]
    movu       m1, [srcq+16* 5]
    movu       m2, [srcq+16* 6]
    movu       m3, [srcq+16* 7]
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq+16*4], m0
    mova       [tmpq+16*5], m1
    mova       [tmpq+16*6], m2
    mova       [tmpq+16*7], m3
    movu       m0, [srcq+16* 8]
    movu       m1, [srcq+16* 9]
    movu       m2, [srcq+16*10]
    movu       m3, [srcq+16*11]
    ; advance tmp early and address the tail with negative offsets
    add        tmpq, 16*16
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq-16*8], m0
    mova       [tmpq-16*7], m1
    mova       [tmpq-16*6], m2
    mova       [tmpq-16*5], m3
    movu       m0, [srcq+16*12]
    movu       m1, [srcq+16*13]
    movu       m2, [srcq+16*14]
    movu       m3, [srcq+16*15]
    add        srcq, strideq
    REPX       {pmullw x, m4}, m0, m1, m2, m3
    REPX       {psubw  x, m5}, m0, m1, m2, m3
    mova       [tmpq-16*4], m0
    mova       [tmpq-16*3], m1
    mova       [tmpq-16*2], m2
    mova       [tmpq-16*1], m3
    dec        hd
    jg         .prep_w128
    RET
.h:
    ; m4 = mx, m3 = 16-mx; pw_32766 bias replaces the put-path rounding
    movd       m4, mxyd
    mov        mxyd, r6m ; my
    mova       m3, [base+pw_16]
    pshufb     m4, [base+pw_256]
    mova       m5, [base+pw_32766]
    psubw      m3, m4
    test       dword r7m, 0x800
    jnz        .h_12bpc
    psllw      m3, 2
    psllw      m4, 2
.h_12bpc:
    test       mxyd, mxyd
    jnz        .hv
    sub        wd, 8
    je         .h_w8
    jg         .h_w16
.h_w4:
    movq       m0, [srcq+strideq*0]
    movhps     m0, [srcq+strideq*1]
    movq       m1, [srcq+strideq*0+2]
    movhps     m1, [srcq+strideq*1+2]
    lea        srcq, [srcq+strideq*2]
    pmullw     m0, m3
    pmullw     m1, m4
    psubw      m0, m5
    paddw      m0, m1
    psraw      m0, 2
    mova       [tmpq], m0
    add        tmpq, 16
    sub        hd, 2
    jg         .h_w4
    RET
.h_w8:
    movu       m0, [srcq+strideq*0]
    movu       m1, [srcq+strideq*0+2]
    pmullw     m0, m3
    pmullw     m1, m4
    psubw      m0, m5
    paddw      m0, m1
    movu       m1, [srcq+strideq*1]
    movu       m2, [srcq+strideq*1+2]
    lea        srcq, [srcq+strideq*2]
    pmullw     m1, m3
    pmullw     m2, m4
    psubw      m1, m5
    paddw      m1, m2
    psraw      m0, 2
    psraw      m1, 2
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    add        tmpq, 16*2
    sub        hd, 2
    jg         .h_w8
    RET
.h_w16:
    lea        srcq, [srcq+wq*2]
    neg        wq
.h_w16_loop0:
    mov        r6, wq
.h_w16_loop:
    movu       m0, [srcq+r6*2+ 0]
    movu       m1, [srcq+r6*2+ 2]
    pmullw     m0, m3
    pmullw     m1, m4
    psubw      m0, m5
    paddw      m0, m1
    movu       m1, [srcq+r6*2+16]
    movu       m2, [srcq+r6*2+18]
    pmullw     m1, m3
    pmullw     m2, m4
    psubw      m1, m5
    paddw      m1, m2
    psraw      m0, 2
    psraw      m1, 2
    mova       [tmpq+16*0], m0
    mova       [tmpq+16*1], m1
    add        tmpq, 16*2
    add        r6, 16
    jl         .h_w16_loop
    add        srcq, strideq
    dec        hd
    jg         .h_w16_loop0
    RET
.v:
    movd       m4, mxyd
    mova       m3, [base+pw_16]
    pshufb     m4, [base+pw_256]
    mova       m5, [base+pw_32766]
    psubw      m3, m4
    test       dword r7m, 0x800
    jnz        .v_12bpc
    psllw      m3, 2
    psllw      m4, 2
.v_12bpc:
    cmp        wd, 8
    je         .v_w8
    jg         .v_w16
.v_w4:
    movq       m0, [srcq+strideq*0]
.v_w4_loop:
    movq       m2, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
    punpcklqdq m1, m0, m2 ; 0 1
    movq       m0, [srcq+strideq*0]
    punpcklqdq m2, m0     ; 1 2
    pmullw     m1, m3
    pmullw     m2, m4
    psubw      m1, m5
    paddw      m1, m2
    psraw      m1, 2
    mova       [tmpq], m1
    add        tmpq, 16
    sub        hd, 2
    jg         .v_w4_loop
    RET
.v_w8:
    movu       m0, [srcq+strideq*0]
.v_w8_loop:
    movu       m2, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
    pmullw     m0, m3
    pmullw     m1, m4, m2
    psubw      m0, m5
    paddw      m1, m0
    movu       m0, [srcq+strideq*0]
    psraw      m1, 2
    pmullw     m2, m3
    mova       [tmpq+16*0], m1
    pmullw     m1, m4, m0
    psubw      m2, m5
    paddw      m1, m2
    psraw      m1, 2
    mova       [tmpq+16*1], m1
    add        tmpq, 16*2
    sub        hd, 2
    jg         .v_w8_loop
    RET
.v_w16:
%if WIN64
    push       r7
%endif
    mov        r5, srcq
%if ARCH_X86_64
    ; pack per-column row count: r6d = h | ((w*4-32)*8), low byte = h
    lea        r6d, [wq*4-32]
    mov        wd, wd
    lea        r6d, [hq+r6*8]
    mov        r7, tmpq
%else
    mov        r6d, wd
%endif
.v_w16_loop0:
    movu       m0, [srcq+strideq*0]
.v_w16_loop:
    movu       m2, [srcq+strideq*1]
    lea        srcq, [srcq+strideq*2]
    pmullw     m0, m3
    pmullw     m1, m4, m2
    psubw      m0, m5
    paddw      m1, m0
    movu       m0, [srcq+strideq*0]
    psraw      m1, 2
    pmullw     m2, m3
    mova       [tmpq+wq*0], m1
    pmullw     m1, m4, m0
    psubw      m2, m5
    paddw      m1, m2
    psraw      m1, 2
    mova       [tmpq+wq*2], m1
    lea        tmpq, [tmpq+wq*4]
    sub        hd, 2
    jg         .v_w16_loop
%if ARCH_X86_64
    add        r5, 16
    add        r7, 16
    movzx      hd, r6b
    mov        srcq, r5
    mov        tmpq, r7
    sub        r6d, 1<<8
%else
    mov        tmpq, tmpmp
    add        r5, 16
    mov        hd, hm
    add        tmpq, 16
    mov        srcq, r5
    mov        tmpmp, tmpq
    sub        r6d, 8
%endif
    jg         .v_w16_loop0
%if WIN64
    pop        r7
%endif
    RET
.hv:
    WIN64_SPILL_XMM 7
    shl        mxyd, 11
    movd       m6, mxyd
    pshufb     m6, [base+pw_256]
    cmp        wd, 8
    je         .hv_w8
    jg         .hv_w16
.hv_w4:
    movddup    m0, [srcq+strideq*0]
    movddup    m1, [srcq+strideq*0+2]
    pmullw     m0, m3
    pmullw     m1, m4
    psubw      m0, m5
    paddw      m0, m1
    psraw      m0, 2
.hv_w4_loop:
    movq       m1, [srcq+strideq*1]
    movq       m2, [srcq+strideq*1+2]
    lea        srcq, [srcq+strideq*2]
    movhps     m1, [srcq+strideq*0]
    movhps     m2, [srcq+strideq*0+2]
    pmullw     m1, m3
    pmullw     m2, m4
    psubw      m1, m5
    paddw      m1, m2
    psraw      m1, 2            ; 1 2
    shufpd     m2, m0, m1, 0x01 ; 0 1
    mova       m0, m1
    psubw      m1, m2
    pmulhrsw   m1, m6
    paddw      m1, m2
    mova       [tmpq], m1
    add        tmpq, 16
    sub        hd, 2
    jg         .hv_w4_loop
    RET
.hv_w8:
    movu       m0, [srcq+strideq*0]
    movu       m1, [srcq+strideq*0+2]
    pmullw     m0, m3
    pmullw     m1, m4
    psubw      m0, m5
    paddw      m0, m1
    psraw      m0, 2
.hv_w8_loop:
    movu       m1, [srcq+strideq*1]
    movu       m2, [srcq+strideq*1+2]
    lea        srcq, [srcq+strideq*2]
    pmullw     m1, m3
    pmullw     m2, m4
    psubw      m1, m5
    paddw      m1, m2
    psraw      m1, 2
    psubw      m2, m1, m0
    pmulhrsw   m2, m6
    paddw      m2, m0
    mova       [tmpq+16*0], m2
    movu       m0, [srcq+strideq*0]
    movu       m2, [srcq+strideq*0+2]
    pmullw     m0, m3
    pmullw     m2, m4
    psubw      m0, m5
    paddw      m0, m2
    psraw      m0, 2
    psubw      m2, m0, m1
    pmulhrsw   m2, m6
    paddw      m2, m1
    mova       [tmpq+16*1], m2
    add        tmpq, 16*2
    sub        hd, 2
    jg         .hv_w8_loop
    RET
.hv_w16:
%if WIN64
    push       r7
%endif
    mov        r5, srcq
%if ARCH_X86_64
    lea        r6d, [wq*4-32]
    mov        wd, wd
    lea        r6d, [hq+r6*8]
    mov        r7, tmpq
%else
    mov        r6d, wd
%endif
.hv_w16_loop0:
    movu       m0, [srcq+strideq*0]
    movu       m1, [srcq+strideq*0+2]
    pmullw     m0, m3
    pmullw     m1, m4
    psubw      m0, m5
    paddw      m0, m1
    psraw      m0, 2
.hv_w16_loop:
    movu       m1, [srcq+strideq*1]
    movu       m2, [srcq+strideq*1+2]
    lea        srcq, [srcq+strideq*2]
    pmullw     m1, m3
    pmullw     m2, m4
    psubw      m1, m5
    paddw      m1, m2
    psraw      m1, 2
    psubw      m2, m1, m0
    pmulhrsw   m2, m6
    paddw      m2, m0
    mova       [tmpq+wq*0], m2
    movu       m0, [srcq+strideq*0]
    movu       m2, [srcq+strideq*0+2]
    pmullw     m0, m3
    pmullw     m2, m4
    psubw      m0, m5
    paddw      m0, m2
    psraw      m0, 2
    psubw      m2, m0, m1
    pmulhrsw   m2, m6
    paddw      m2, m1
    mova       [tmpq+wq*2], m2
    lea        tmpq, [tmpq+wq*4]
    sub        hd, 2
    jg         .hv_w16_loop
%if ARCH_X86_64
    add        r5, 16
    add        r7, 16
    movzx      hd, r6b
    mov        srcq, r5
    mov        tmpq, r7
    sub        r6d, 1<<8
%else
    mov        tmpq, tmpmp
    add        r5, 16
    mov        hd, hm
    add        tmpq, 16
    mov        srcq, r5
    mov        tmpmp, tmpq
    sub        r6d, 8
%endif
    jg         .hv_w16_loop0
%if WIN64
    pop        r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15

; Generate one filter-combination entry point that loads the h/v filter
; selectors into t0d/t1d and falls through (or jumps) to the shared body.
%macro FN 4 ; prefix, type, type_h, type_v
cglobal %1_%2_16bpc
    mov        t0d, FILTER_%3
%ifidn %3, %4
    mov        t1d, t0d
%else
    mov        t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp        mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
%endif
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 1, 2, 6
%elif WIN64
DECLARE_REG_TMP 4, 5, 8
%else
DECLARE_REG_TMP 7, 8, 8
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp,          SHARP,   SHARP
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
PUT_8TAP_FN regular,        REGULAR, REGULAR

;------------------------------------------------------------------------------
; put_8tap_16bpc(dst, ds, src, ss, w, h, mx, my, bitdepth_max)
; 8-tap subpel put; t0d/t1d (set by the FN stubs above) select the h/v
; filter sets.  On x86-32, m8-m15 are spilled to stack slots.
;------------------------------------------------------------------------------
%if ARCH_X86_32
cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
%define mxb r0b
%define mxd r0
%define mxq r0
%define myb r1b
%define myd r1
%define myq r1
%define m8  [esp+16*0]
%define m9  [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%else
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%endif
%define base t2-put_ssse3
    imul       mxd, mxm, 0x010101
    add        mxd, t0d ; 8tap_h, mx, 4tap_h
    imul       myd, mym, 0x010101
    add        myd, t1d ; 8tap_v, my, 4tap_v
    LEA        t2, put_ssse3
    movifnidn  wd, wm
    movifnidn  srcq, srcmp
    movifnidn  ssq, ssmp
    movifnidn  hd, hm
    test       mxd, 0xf00
    jnz        .h
    test       myd, 0xf00
    jnz        .v
    ; no subpel in either direction -> plain copy via put_bilin's table
    tzcnt      wd, wd
    movzx      wd, word [base+put_ssse3_table+wq*2]
    movifnidn  dstq, dstmp
    movifnidn  dsq, dsmp
    add        wq, t2
%if WIN64
    pop        r8
    pop        r7
%endif
    jmp        wq
.h:
    test       myd, 0xf00
    jnz        .hv
    mov        myd, r8m
    movd       m5, r8m      ; m5 = bitdepth_max (pixel clamp)
    shr        myd, 11
    movddup    m4, [base+put_8tap_h_rnd+myq*8]
    movifnidn  dsq, dsmp
    pshufb     m5, [base+pw_256]
    cmp        wd, 4
    jg         .h_w8
    movzx      mxd, mxb
    lea        srcq, [srcq-2]
    movq       m3, [base+subpel_filters+mxq*8]
    movifnidn  dstq, dstmp
    punpcklbw  m3, m3
    psraw      m3, 8 ; sign-extend
    je         .h_w4
.h_w2:
    mova       m2, [base+spel_h_shuf2]
    pshufd     m3, m3, q2121
.h_w2_loop:
    movu       m0, [srcq+ssq*0]
    movu       m1, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pshufb     m0, m2
    pshufb     m1, m2
    pmaddwd    m0, m3
    pmaddwd    m1, m3
    phaddd     m0, m1
    paddd      m0, m4
    psrad      m0, 6
    packssdw   m0, m0
    pxor       m1, m1
    pminsw     m0, m5
    pmaxsw     m0, m1
    movd       [dstq+dsq*0], m0
    pshuflw    m0, m0, q3232
    movd       [dstq+dsq*1], m0
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .h_w2_loop
    RET
.h_w4:
    WIN64_SPILL_XMM 8
    mova       m6, [base+spel_h_shufA]
    mova       m7, [base+spel_h_shufB]
    pshufd     m2, m3, q1111
    pshufd     m3, m3, q2222
.h_w4_loop:
    movu       m1, [srcq]
    add        srcq, ssq
    pshufb     m0, m1, m6 ; 0 1 1 2 2 3 3 4
    pshufb     m1, m7     ; 2 3 3 4 4 5 5 6
    pmaddwd    m0, m2
    pmaddwd    m1, m3
    paddd      m0, m4
    paddd      m0, m1
    psrad      m0, 6
    packssdw   m0, m0
    pxor       m1, m1
    pminsw     m0, m5
    pmaxsw     m0, m1
    movq       [dstq], m0
    add        dstq, dsq
    dec        hd
    jg         .h_w4_loop
    RET
.h_w8:
    WIN64_SPILL_XMM 12
    shr        mxd, 16
    movq       m3, [base+subpel_filters+mxq*8]
    movifnidn  dstq, dstmp
    mova       m6, [base+spel_h_shufA]
    mova       m7, [base+spel_h_shufB]
%if UNIX64
    mov        wd, wd
%endif
    lea        srcq, [srcq+wq*2]
    punpcklbw  m3, m3
    lea        dstq, [dstq+wq*2]
    psraw      m3, 8
    neg        wq
%if ARCH_X86_32
    ALLOC_STACK -16*4
    pshufd     m0, m3, q0000
    pshufd     m1, m3, q1111
    pshufd     m2, m3, q2222
    pshufd     m3, m3, q3333
    mova       m8, m0
    mova       m9, m1
    mova       m10, m2
    mova       m11, m3
%else
    pshufd     m8, m3, q0000
    pshufd     m9, m3, q1111
    pshufd     m10, m3, q2222
    pshufd     m11, m3, q3333
%endif
.h_w8_loop0:
    mov        r6, wq
.h_w8_loop:
    movu       m0, [srcq+r6*2- 6]
    movu       m1, [srcq+r6*2+ 2]
    pshufb     m2, m0, m6  ; 0 1 1 2 2 3 3 4
    pshufb     m0, m7      ; 2 3 3 4 4 5 5 6
    pmaddwd    m2, m8      ; abcd0
    pmaddwd    m0, m9      ; abcd1
    pshufb     m3, m1, m6  ; 4 5 5 6 6 7 7 8
    pshufb     m1, m7      ; 6 7 7 8 8 9 9 a
    paddd      m2, m4
    paddd      m0, m2
    pmaddwd    m2, m10, m3 ; abcd2
    pmaddwd    m3, m8      ; efgh0
    paddd      m0, m2
    pmaddwd    m2, m11, m1 ; abcd3
    pmaddwd    m1, m9      ; efgh1
    paddd      m0, m2
    movu       m2, [srcq+r6*2+10]
    paddd      m3, m4
    paddd      m1, m3
    pshufb     m3, m2, m6  ; 8 9 9 a a b b c
    pshufb     m2, m7      ; a b b c c d d e
    pmaddwd    m3, m10     ; efgh2
    pmaddwd    m2, m11     ; efgh3
    paddd      m1, m3
    paddd      m1, m2
    psrad      m0, 6
    psrad      m1, 6
    packssdw   m0, m1
    pxor       m1, m1
    pminsw     m0, m5
    pmaxsw     m0, m1
    mova       [dstq+r6*2], m0
    add        r6, 8
    jl         .h_w8_loop
    add        srcq, ssq
    add        dstq, dsq
    dec        hd
    jg         .h_w8_loop0
    RET
.v:
    ; h < 6 uses the 4-tap variant of the vertical filter (low selector)
    movzx      mxd, myb
    shr        myd, 16
    cmp        hd, 6
    cmovb      myd, mxd
    movq       m3, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM 15
    movd       m7, r8m
    movifnidn  dstq, dstmp
    movifnidn  dsq, dsmp
    punpcklbw  m3, m3
    pshufb     m7, [base+pw_256]
    psraw      m3, 8 ; sign-extend
%if ARCH_X86_32
    ALLOC_STACK -16*7
    pshufd     m0, m3, q0000
    pshufd     m1, m3, q1111
    pshufd     m2, m3, q2222
    pshufd     m3, m3, q3333
    mova       m8, m0
    mova       m9, m1
    mova       m10, m2
    mova       m11, m3
%else
    pshufd     m8, m3, q0000
    pshufd     m9, m3, q1111
    pshufd     m10, m3, q2222
    pshufd     m11, m3, q3333
%endif
    lea        r6, [ssq*3]
    sub        srcq, r6    ; back up 3 rows for the 8-tap history
    cmp        wd, 2
    jne        .v_w4
.v_w2:
    movd       m1, [srcq+ssq*0]
    movd       m4, [srcq+ssq*1]
    movd       m2, [srcq+ssq*2]
    add        srcq, r6
    movd       m5, [srcq+ssq*0]
    movd       m3, [srcq+ssq*1]
    movd       m6, [srcq+ssq*2]
    add        srcq, r6
    movd       m0, [srcq+ssq*0]
    punpckldq  m1, m4 ; 0 1
    punpckldq  m4, m2 ; 1 2
    punpckldq  m2, m5 ; 2 3
    punpckldq  m5, m3 ; 3 4
    punpckldq  m3, m6 ; 4 5
    punpckldq  m6, m0 ; 5 6
    punpcklwd  m1, m4 ; 01 12
    punpcklwd  m2, m5 ; 23 34
    punpcklwd  m3, m6 ; 45 56
    pxor       m6, m6
.v_w2_loop:
    movd       m4, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    pmaddwd    m5, m8, m1  ; a0 b0
    mova       m1, m2
    pmaddwd    m2, m9      ; a1 b1
    paddd      m5, m2
    mova       m2, m3
    pmaddwd    m3, m10     ; a2 b2
    paddd      m5, m3
    punpckldq  m3, m0, m4  ; 6 7
    movd       m0, [srcq+ssq*0]
    punpckldq  m4, m0      ; 7 8
    punpcklwd  m3, m4      ; 67 78
    pmaddwd    m4, m11, m3 ; a3 b3
    paddd      m5, m4
    psrad      m5, 5
    packssdw   m5, m5
    pmaxsw     m5, m6
    pavgw      m5, m6      ; pavgw vs zero = +1 round & >>1 (final >>6 total)
    pminsw     m5, m7
    movd       [dstq+dsq*0], m5
    pshuflw    m5, m5, q3232
    movd       [dstq+dsq*1], m5
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .v_w2_loop
    RET
.v_w4:
%if ARCH_X86_32
    shl        wd, 14
%if STACK_ALIGNMENT < 16
    mov        [esp+4*29], srcq
    mov        [esp+4*30], dstq
%else
    mov        srcmp, srcq
%endif
    lea        wd, [wq+hq-(1<<16)]
%else
    shl        wd, 6
    mov        r7, srcq
    mov        r8, dstq
    lea        wd, [wq+hq-(1<<8)]
%endif
.v_w4_loop0:
    movq       m1, [srcq+ssq*0]
    movq       m2, [srcq+ssq*1]
    movq       m3, [srcq+ssq*2]
    add        srcq, r6
    movq       m4, [srcq+ssq*0]
    movq       m5, [srcq+ssq*1]
    movq       m6, [srcq+ssq*2]
    add        srcq, r6
    movq       m0, [srcq+ssq*0]
    punpcklwd  m1, m2 ; 01
    punpcklwd  m2, m3 ; 12
    punpcklwd  m3, m4 ; 23
    punpcklwd  m4, m5 ; 34
    punpcklwd  m5, m6 ; 45
    punpcklwd  m6, m0 ; 56
%if ARCH_X86_32
    jmp        .v_w4_loop_start
.v_w4_loop:
    mova       m1, m12
    mova       m2, m13
    mova       m3, m14
.v_w4_loop_start:
    pmaddwd    m1, m8      ; a0
    pmaddwd    m2, m8      ; b0
    mova       m12, m3
    mova       m13, m4
    pmaddwd    m3, m9      ; a1
    pmaddwd    m4, m9      ; b1
    paddd      m1, m3
    paddd      m2, m4
    mova       m14, m5
    mova       m4, m6
    pmaddwd    m5, m10     ; a2
    pmaddwd    m6, m10     ; b2
    paddd      m1, m5
    paddd      m2, m6
    movq       m6, [srcq+ssq*1]
    lea        srcq, [srcq+ssq*2]
    punpcklwd  m5, m0, m6  ; 67
    movq       m0, [srcq+ssq*0]
    pmaddwd    m3, m11, m5 ; a3
    punpcklwd  m6, m0      ; 78
    paddd      m1, m3
    pmaddwd    m3, m11, m6 ; b3
    paddd      m2, m3
    psrad      m1, 5
    psrad      m2, 5
    packssdw   m1, m2
    pxor       m2, m2
    pmaxsw     m1, m2
    pavgw      m1, m2
    pminsw     m1, m7
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea        dstq, [dstq+dsq*2]
    sub        hd, 2
    jg         .v_w4_loop
%if STACK_ALIGNMENT < 16
    mov        srcq, [esp+4*29]
    mov        dstq, [esp+4*30]
    movzx      hd, ww
    add        srcq, 8
    add        dstq, 8
    mov        [esp+4*29], srcq
    mov        [esp+4*30], dstq
%else
    mov        srcq, srcmp
    mov        dstq, dstmp
    movzx      hd, ww
    add        srcq, 8
    add        dstq, 8
    mov        srcmp, srcq
    mov        dstmp, dstq
%endif
    sub        wd, 1<<16
%else
.v_w4_loop:
    pmaddwd    m12, m8, m1 ; a0
; NOTE(review): put_8tap_16bpc continues past the end of this file chunk —
; the remainder of the x86-64 .v_w4_loop (and everything after it) is not
; visible here and has been left untouched.
pmaddwd m13, m8, m2 ; b0 1554 mova m1, m3 1555 mova m2, m4 1556 pmaddwd m3, m9 ; a1 1557 pmaddwd m4, m9 ; b1 1558 paddd m12, m3 1559 paddd m13, m4 1560 mova m3, m5 1561 mova m4, m6 1562 pmaddwd m5, m10 ; a2 1563 pmaddwd m6, m10 ; b2 1564 paddd m12, m5 1565 paddd m13, m6 1566 movq m6, [srcq+ssq*1] 1567 lea srcq, [srcq+ssq*2] 1568 punpcklwd m5, m0, m6 ; 67 1569 movq m0, [srcq+ssq*0] 1570 pmaddwd m14, m11, m5 ; a3 1571 punpcklwd m6, m0 ; 78 1572 paddd m12, m14 1573 pmaddwd m14, m11, m6 ; b3 1574 paddd m13, m14 1575 psrad m12, 5 1576 psrad m13, 5 1577 packssdw m12, m13 1578 pxor m13, m13 1579 pmaxsw m12, m13 1580 pavgw m12, m13 1581 pminsw m12, m7 1582 movq [dstq+dsq*0], m12 1583 movhps [dstq+dsq*1], m12 1584 lea dstq, [dstq+dsq*2] 1585 sub hd, 2 1586 jg .v_w4_loop 1587 add r7, 8 1588 add r8, 8 1589 movzx hd, wb 1590 mov srcq, r7 1591 mov dstq, r8 1592 sub wd, 1<<8 1593%endif 1594 jg .v_w4_loop0 1595 RET 1596.hv: 1597 RESET_STACK_STATE 1598%if ARCH_X86_32 1599 movd m4, r8m 1600 mova m6, [base+pd_512] 1601 pshufb m4, [base+pw_256] 1602%else 1603%if WIN64 1604 ALLOC_STACK 16*6, 16 1605%endif 1606 movd m15, r8m 1607 pshufb m15, [base+pw_256] 1608%endif 1609 cmp wd, 4 1610 jg .hv_w8 1611 movzx mxd, mxb 1612 je .hv_w4 1613 movq m0, [base+subpel_filters+mxq*8] 1614 movzx mxd, myb 1615 shr myd, 16 1616 cmp hd, 6 1617 cmovb myd, mxd 1618 movq m3, [base+subpel_filters+myq*8] 1619%if ARCH_X86_32 1620 mov dstq, dstmp 1621 mov dsq, dsmp 1622 mova m5, [base+spel_h_shuf2] 1623 ALLOC_STACK -16*8 1624%else 1625 mova m6, [base+pd_512] 1626 mova m9, [base+spel_h_shuf2] 1627%endif 1628 pshuflw m0, m0, q2121 1629 pxor m7, m7 1630 punpcklbw m7, m0 1631 punpcklbw m3, m3 1632 psraw m3, 8 ; sign-extend 1633 test dword r8m, 0x800 1634 jz .hv_w2_10bpc 1635 psraw m7, 2 1636 psllw m3, 2 1637.hv_w2_10bpc: 1638 lea r6, [ssq*3] 1639 sub srcq, 2 1640 sub srcq, r6 1641%if ARCH_X86_32 1642 pshufd m0, m3, q0000 1643 pshufd m1, m3, q1111 1644 pshufd m2, m3, q2222 1645 pshufd m3, m3, q3333 1646 mova m9, 
m5 1647 mova m11, m0 1648 mova m12, m1 1649 mova m13, m2 1650 mova m14, m3 1651 mova m15, m4 1652%else 1653 pshufd m11, m3, q0000 1654 pshufd m12, m3, q1111 1655 pshufd m13, m3, q2222 1656 pshufd m14, m3, q3333 1657%endif 1658 movu m2, [srcq+ssq*0] 1659 movu m3, [srcq+ssq*1] 1660 movu m1, [srcq+ssq*2] 1661 add srcq, r6 1662 movu m4, [srcq+ssq*0] 1663%if ARCH_X86_32 1664 REPX {pshufb x, m5}, m2, m3, m1, m4 1665%else 1666 REPX {pshufb x, m9}, m2, m3, m1, m4 1667%endif 1668 REPX {pmaddwd x, m7}, m2, m3, m1, m4 1669 phaddd m2, m3 ; 0 1 1670 phaddd m1, m4 ; 2 3 1671 movu m3, [srcq+ssq*1] 1672 movu m4, [srcq+ssq*2] 1673 add srcq, r6 1674 movu m0, [srcq+ssq*0] 1675%if ARCH_X86_32 1676 REPX {pshufb x, m5}, m3, m4, m0 1677%else 1678 REPX {pshufb x, m9}, m3, m4, m0 1679%endif 1680 REPX {pmaddwd x, m7}, m3, m4, m0 1681 phaddd m3, m4 ; 4 5 1682 phaddd m0, m0 ; 6 6 1683 REPX {paddd x, m6}, m2, m1, m3, m0 1684 REPX {psrad x, 10}, m2, m1, m3, m0 1685 packssdw m2, m1 ; 0 1 2 3 1686 packssdw m3, m0 ; 4 5 6 _ 1687 palignr m4, m3, m2, 4 ; 1 2 3 4 1688 pshufd m5, m3, q0321 ; 5 6 _ _ 1689 punpcklwd m1, m2, m4 ; 01 12 1690 punpckhwd m2, m4 ; 23 34 1691 punpcklwd m3, m5 ; 45 56 1692.hv_w2_loop: 1693 movu m4, [srcq+ssq*1] 1694 lea srcq, [srcq+ssq*2] 1695 movu m5, [srcq+ssq*0] 1696 pshufb m4, m9 1697 pshufb m5, m9 1698 pmaddwd m4, m7 1699 pmaddwd m5, m7 1700 phaddd m4, m5 1701 pmaddwd m5, m11, m1 ; a0 b0 1702 mova m1, m2 1703 pmaddwd m2, m12 ; a1 b1 1704 paddd m5, m2 1705 mova m2, m3 1706 pmaddwd m3, m13 ; a2 b2 1707 paddd m5, m3 1708 paddd m4, m6 1709 psrad m4, 10 ; 7 8 1710 packssdw m0, m4 1711 pshufd m3, m0, q2103 1712 punpckhwd m3, m0 ; 67 78 1713 mova m0, m4 1714 pmaddwd m4, m14, m3 ; a3 b3 1715 paddd m5, m6 1716 paddd m5, m4 1717 psrad m5, 10 1718 packssdw m5, m5 1719 pxor m4, m4 1720 pminsw m5, m15 1721 pmaxsw m5, m4 1722 movd [dstq+dsq*0], m5 1723 pshuflw m5, m5, q3232 1724 movd [dstq+dsq*1], m5 1725 lea dstq, [dstq+dsq*2] 1726 sub hd, 2 1727 jg .hv_w2_loop 1728 RET 1729.hv_w8: 
1730 shr mxd, 16 1731.hv_w4: 1732 movq m2, [base+subpel_filters+mxq*8] 1733 movzx mxd, myb 1734 shr myd, 16 1735 cmp hd, 6 1736 cmovb myd, mxd 1737 movq m3, [base+subpel_filters+myq*8] 1738%if ARCH_X86_32 1739 RESET_STACK_STATE 1740 mov dstq, dstmp 1741 mov dsq, dsmp 1742 mova m0, [base+spel_h_shufA] 1743 mova m1, [base+spel_h_shufB] 1744 ALLOC_STACK -16*15 1745 mova m8, m0 1746 mova m9, m1 1747 mova m14, m6 1748%else 1749 mova m8, [base+spel_h_shufA] 1750 mova m9, [base+spel_h_shufB] 1751%endif 1752 pxor m0, m0 1753 punpcklbw m0, m2 1754 punpcklbw m3, m3 1755 psraw m3, 8 1756 test dword r8m, 0x800 1757 jz .hv_w4_10bpc 1758 psraw m0, 2 1759 psllw m3, 2 1760.hv_w4_10bpc: 1761 lea r6, [ssq*3] 1762 sub srcq, 6 1763 sub srcq, r6 1764%if ARCH_X86_32 1765 %define tmp esp+16*8 1766 shl wd, 14 1767%if STACK_ALIGNMENT < 16 1768 mov [esp+4*61], srcq 1769 mov [esp+4*62], dstq 1770%else 1771 mov srcmp, srcq 1772%endif 1773 mova [tmp+16*5], m4 1774 lea wd, [wq+hq-(1<<16)] 1775 pshufd m1, m0, q0000 1776 pshufd m2, m0, q1111 1777 pshufd m5, m0, q2222 1778 pshufd m0, m0, q3333 1779 mova m10, m1 1780 mova m11, m2 1781 mova m12, m5 1782 mova m13, m0 1783%else 1784%if WIN64 1785 %define tmp rsp 1786%else 1787 %define tmp rsp-104 ; red zone 1788%endif 1789 shl wd, 6 1790 mov r7, srcq 1791 mov r8, dstq 1792 lea wd, [wq+hq-(1<<8)] 1793 pshufd m10, m0, q0000 1794 pshufd m11, m0, q1111 1795 pshufd m12, m0, q2222 1796 pshufd m13, m0, q3333 1797 mova [tmp+16*5], m15 1798%endif 1799 pshufd m0, m3, q0000 1800 pshufd m1, m3, q1111 1801 pshufd m2, m3, q2222 1802 pshufd m3, m3, q3333 1803 mova [tmp+16*1], m0 1804 mova [tmp+16*2], m1 1805 mova [tmp+16*3], m2 1806 mova [tmp+16*4], m3 1807%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512] 1808 pshufb m%3, m%1, m8 ; 0 1 1 2 2 3 3 4 1809 pshufb m%1, m9 ; 2 3 3 4 4 5 5 6 1810 pmaddwd m%3, m10 1811 pmaddwd m%1, m11 1812 paddd m%3, %5 1813 paddd m%1, m%3 1814 pshufb m%3, m%2, m8 ; 4 5 5 6 6 7 7 8 1815 pshufb m%2, m9 ; 6 7 7 8 8 9 9 a 
1816 pmaddwd m%3, m12 1817 pmaddwd m%2, m13 1818 paddd m%1, m%3 1819 paddd m%1, m%2 1820 psrad m%1, %4 1821%endmacro 1822.hv_w4_loop0: 1823%if ARCH_X86_64 1824 mova m14, [pd_512] 1825%endif 1826 movu m4, [srcq+ssq*0+0] 1827 movu m1, [srcq+ssq*0+8] 1828 movu m5, [srcq+ssq*1+0] 1829 movu m2, [srcq+ssq*1+8] 1830 movu m6, [srcq+ssq*2+0] 1831 movu m3, [srcq+ssq*2+8] 1832 add srcq, r6 1833 PUT_8TAP_HV_H 4, 1, 0, 10 1834 PUT_8TAP_HV_H 5, 2, 0, 10 1835 PUT_8TAP_HV_H 6, 3, 0, 10 1836 movu m7, [srcq+ssq*0+0] 1837 movu m2, [srcq+ssq*0+8] 1838 movu m1, [srcq+ssq*1+0] 1839 movu m3, [srcq+ssq*1+8] 1840 PUT_8TAP_HV_H 7, 2, 0, 10 1841 PUT_8TAP_HV_H 1, 3, 0, 10 1842 movu m2, [srcq+ssq*2+0] 1843 movu m3, [srcq+ssq*2+8] 1844 add srcq, r6 1845 PUT_8TAP_HV_H 2, 3, 0, 10 1846 packssdw m4, m7 ; 0 3 1847 packssdw m5, m1 ; 1 4 1848 movu m0, [srcq+ssq*0+0] 1849 movu m1, [srcq+ssq*0+8] 1850 PUT_8TAP_HV_H 0, 1, 3, 10 1851 packssdw m6, m2 ; 2 5 1852 packssdw m7, m0 ; 3 6 1853 punpcklwd m1, m4, m5 ; 01 1854 punpckhwd m4, m5 ; 34 1855 punpcklwd m2, m5, m6 ; 12 1856 punpckhwd m5, m6 ; 45 1857 punpcklwd m3, m6, m7 ; 23 1858 punpckhwd m6, m7 ; 56 1859%if ARCH_X86_32 1860 jmp .hv_w4_loop_start 1861.hv_w4_loop: 1862 mova m1, [tmp+16*6] 1863 mova m2, m15 1864.hv_w4_loop_start: 1865 mova m7, [tmp+16*1] 1866 pmaddwd m1, m7 ; a0 1867 pmaddwd m2, m7 ; b0 1868 mova m7, [tmp+16*2] 1869 mova [tmp+16*6], m3 1870 pmaddwd m3, m7 ; a1 1871 mova m15, m4 1872 pmaddwd m4, m7 ; b1 1873 mova m7, [tmp+16*3] 1874 paddd m1, m3 1875 paddd m2, m4 1876 mova m3, m5 1877 pmaddwd m5, m7 ; a2 1878 mova m4, m6 1879 pmaddwd m6, m7 ; b2 1880 paddd m1, m5 1881 paddd m2, m6 1882 movu m7, [srcq+ssq*1+0] 1883 movu m5, [srcq+ssq*1+8] 1884 lea srcq, [srcq+ssq*2] 1885 PUT_8TAP_HV_H 7, 5, 6, 10 1886 packssdw m0, m7 ; 6 7 1887 mova [tmp+16*0], m0 1888 movu m0, [srcq+ssq*0+0] 1889 movu m5, [srcq+ssq*0+8] 1890 PUT_8TAP_HV_H 0, 5, 6, 10 1891 mova m6, [tmp+16*0] 1892 packssdw m7, m0 ; 7 8 1893 punpcklwd m5, m6, m7 ; 67 1894 punpckhwd m6, m7 ; 
78 1895 pmaddwd m7, m5, [tmp+16*4] 1896 paddd m1, m7 ; a3 1897 pmaddwd m7, m6, [tmp+16*4] 1898 paddd m2, m7 ; b3 1899 psrad m1, 9 1900 psrad m2, 9 1901 packssdw m1, m2 1902 pxor m7, m7 1903 pmaxsw m1, m7 1904 pavgw m7, m1 1905 pminsw m7, [tmp+16*5] 1906 movq [dstq+dsq*0], m7 1907 movhps [dstq+dsq*1], m7 1908 lea dstq, [dstq+dsq*2] 1909 sub hd, 2 1910 jg .hv_w4_loop 1911%if STACK_ALIGNMENT < 16 1912 mov srcq, [esp+4*61] 1913 mov dstq, [esp+4*62] 1914 add srcq, 8 1915 add dstq, 8 1916 mov [esp+4*61], srcq 1917 mov [esp+4*62], dstq 1918%else 1919 mov srcq, srcmp 1920 mov dstq, dstmp 1921 add srcq, 8 1922 add dstq, 8 1923 mov srcmp, srcq 1924 mov dstmp, dstq 1925%endif 1926 movzx hd, ww 1927 sub wd, 1<<16 1928%else 1929.hv_w4_loop: 1930 mova m15, [tmp+16*1] 1931 pmaddwd m14, m15, m1 ; a0 1932 pmaddwd m15, m2 ; b0 1933 mova m7, [tmp+16*2] 1934 mova m1, m3 1935 pmaddwd m3, m7 ; a1 1936 mova m2, m4 1937 pmaddwd m4, m7 ; b1 1938 mova m7, [tmp+16*3] 1939 paddd m14, m3 1940 paddd m15, m4 1941 mova m3, m5 1942 pmaddwd m5, m7 ; a2 1943 mova m4, m6 1944 pmaddwd m6, m7 ; b2 1945 paddd m14, m5 1946 paddd m15, m6 1947 movu m7, [srcq+ssq*1+0] 1948 movu m5, [srcq+ssq*1+8] 1949 lea srcq, [srcq+ssq*2] 1950 PUT_8TAP_HV_H 7, 5, 6, 10, [pd_512] 1951 packssdw m0, m7 ; 6 7 1952 mova [tmp+16*0], m0 1953 movu m0, [srcq+ssq*0+0] 1954 movu m5, [srcq+ssq*0+8] 1955 PUT_8TAP_HV_H 0, 5, 6, 10, [pd_512] 1956 mova m6, [tmp+16*0] 1957 packssdw m7, m0 ; 7 8 1958 punpcklwd m5, m6, m7 ; 67 1959 punpckhwd m6, m7 ; 78 1960 pmaddwd m7, m5, [tmp+16*4] 1961 paddd m14, m7 ; a3 1962 pmaddwd m7, m6, [tmp+16*4] 1963 paddd m15, m7 ; b3 1964 psrad m14, 9 1965 psrad m15, 9 1966 packssdw m14, m15 1967 pxor m7, m7 1968 pmaxsw m14, m7 1969 pavgw m7, m14 1970 pminsw m7, [tmp+16*5] 1971 movq [dstq+dsq*0], m7 1972 movhps [dstq+dsq*1], m7 1973 lea dstq, [dstq+dsq*2] 1974 sub hd, 2 1975 jg .hv_w4_loop 1976 add r7, 8 1977 add r8, 8 1978 movzx hd, wb 1979 mov srcq, r7 1980 mov dstq, r8 1981 sub wd, 1<<8 1982%endif 1983 jg 
.hv_w4_loop0 1984 RET 1985%undef tmp 1986 1987%if ARCH_X86_32 1988DECLARE_REG_TMP 2, 1, 6, 4 1989%elif WIN64 1990DECLARE_REG_TMP 6, 4, 7, 4 1991%else 1992DECLARE_REG_TMP 6, 7, 7, 8 1993%endif 1994 1995%define PREP_8TAP_FN FN prep_8tap, 1996PREP_8TAP_FN sharp, SHARP, SHARP 1997PREP_8TAP_FN sharp_smooth, SHARP, SMOOTH 1998PREP_8TAP_FN smooth_sharp, SMOOTH, SHARP 1999PREP_8TAP_FN smooth, SMOOTH, SMOOTH 2000PREP_8TAP_FN sharp_regular, SHARP, REGULAR 2001PREP_8TAP_FN regular_sharp, REGULAR, SHARP 2002PREP_8TAP_FN smooth_regular, SMOOTH, REGULAR 2003PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH 2004PREP_8TAP_FN regular, REGULAR, REGULAR 2005 2006%if ARCH_X86_32 2007cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my 2008%define mxb r0b 2009%define mxd r0 2010%define mxq r0 2011%define myb r2b 2012%define myd r2 2013%define myq r2 2014%else 2015cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my 2016%endif 2017%define base t2-prep_ssse3 2018 imul mxd, mxm, 0x010101 2019 add mxd, t0d ; 8tap_h, mx, 4tap_h 2020 imul myd, mym, 0x010101 2021 add myd, t1d ; 8tap_v, my, 4tap_v 2022 LEA t2, prep_ssse3 2023 movifnidn wd, wm 2024 movifnidn srcq, srcmp 2025 test mxd, 0xf00 2026 jnz .h 2027 movifnidn hd, hm 2028 test myd, 0xf00 2029 jnz .v 2030 tzcnt wd, wd 2031 mov myd, r7m ; bitdepth_max 2032 movzx wd, word [base+prep_ssse3_table+wq*2] 2033 mova m5, [base+pw_8192] 2034 shr myd, 11 2035 add wq, t2 2036 movddup m4, [base+prep_mul+myq*8] 2037 movifnidn ssq, ssmp 2038 movifnidn tmpq, tmpmp 2039 lea r6, [ssq*3] 2040%if WIN64 2041 pop r7 2042%endif 2043 jmp wq 2044.h: 2045 test myd, 0xf00 2046 jnz .hv 2047 movifnidn ssq, r2mp 2048 movifnidn hd, r4m 2049 movddup m5, [base+prep_8tap_1d_rnd] 2050 cmp wd, 4 2051 jne .h_w8 2052 movzx mxd, mxb 2053 movq m0, [base+subpel_filters+mxq*8] 2054 mova m3, [base+spel_h_shufA] 2055 mova m4, [base+spel_h_shufB] 2056 movifnidn tmpq, tmpmp 2057 sub srcq, 2 2058 WIN64_SPILL_XMM 8 2059 punpcklbw m0, m0 2060 psraw m0, 8 2061 test dword r7m, 
0x800 2062 jnz .h_w4_12bpc 2063 psllw m0, 2 2064.h_w4_12bpc: 2065 pshufd m6, m0, q1111 2066 pshufd m7, m0, q2222 2067.h_w4_loop: 2068 movu m1, [srcq+ssq*0] 2069 movu m2, [srcq+ssq*1] 2070 lea srcq, [srcq+ssq*2] 2071 pshufb m0, m1, m3 ; 0 1 1 2 2 3 3 4 2072 pshufb m1, m4 ; 2 3 3 4 4 5 5 6 2073 pmaddwd m0, m6 2074 pmaddwd m1, m7 2075 paddd m0, m5 2076 paddd m0, m1 2077 pshufb m1, m2, m3 2078 pshufb m2, m4 2079 pmaddwd m1, m6 2080 pmaddwd m2, m7 2081 paddd m1, m5 2082 paddd m1, m2 2083 psrad m0, 4 2084 psrad m1, 4 2085 packssdw m0, m1 2086 mova [tmpq], m0 2087 add tmpq, 16 2088 sub hd, 2 2089 jg .h_w4_loop 2090 RET 2091.h_w8: 2092 WIN64_SPILL_XMM 11 2093 shr mxd, 16 2094 movq m2, [base+subpel_filters+mxq*8] 2095 mova m4, [base+spel_h_shufA] 2096 mova m6, [base+spel_h_shufB] 2097 movifnidn tmpq, r0mp 2098 add wd, wd 2099 punpcklbw m2, m2 2100 add srcq, wq 2101 psraw m2, 8 2102 add tmpq, wq 2103 neg wq 2104 test dword r7m, 0x800 2105 jnz .h_w8_12bpc 2106 psllw m2, 2 2107.h_w8_12bpc: 2108 pshufd m7, m2, q0000 2109%if ARCH_X86_32 2110 ALLOC_STACK -16*3 2111 pshufd m0, m2, q1111 2112 pshufd m1, m2, q2222 2113 pshufd m2, m2, q3333 2114 mova m8, m0 2115 mova m9, m1 2116 mova m10, m2 2117%else 2118 pshufd m8, m2, q1111 2119 pshufd m9, m2, q2222 2120 pshufd m10, m2, q3333 2121%endif 2122.h_w8_loop0: 2123 mov r6, wq 2124.h_w8_loop: 2125 movu m0, [srcq+r6- 6] 2126 movu m1, [srcq+r6+ 2] 2127 pshufb m2, m0, m4 ; 0 1 1 2 2 3 3 4 2128 pshufb m0, m6 ; 2 3 3 4 4 5 5 6 2129 pmaddwd m2, m7 ; abcd0 2130 pmaddwd m0, m8 ; abcd1 2131 pshufb m3, m1, m4 ; 4 5 5 6 6 7 7 8 2132 pshufb m1, m6 ; 6 7 7 8 8 9 9 a 2133 paddd m2, m5 2134 paddd m0, m2 2135 pmaddwd m2, m9, m3 ; abcd2 2136 pmaddwd m3, m7 ; efgh0 2137 paddd m0, m2 2138 pmaddwd m2, m10, m1 ; abcd3 2139 pmaddwd m1, m8 ; efgh1 2140 paddd m0, m2 2141 movu m2, [srcq+r6+10] 2142 paddd m3, m5 2143 paddd m1, m3 2144 pshufb m3, m2, m4 ; a b b c c d d e 2145 pshufb m2, m6 ; 8 9 9 a a b b c 2146 pmaddwd m3, m9 ; efgh2 2147 pmaddwd m2, m10 ; efgh3 
2148 paddd m1, m3 2149 paddd m1, m2 2150 psrad m0, 4 2151 psrad m1, 4 2152 packssdw m0, m1 2153 mova [tmpq+r6], m0 2154 add r6, 16 2155 jl .h_w8_loop 2156 add srcq, ssq 2157 sub tmpq, wq 2158 dec hd 2159 jg .h_w8_loop0 2160 RET 2161.v: 2162 movzx mxd, myb 2163 shr myd, 16 2164 cmp hd, 4 2165 cmove myd, mxd 2166 movq m3, [base+subpel_filters+myq*8] 2167 WIN64_SPILL_XMM 15 2168 movddup m7, [base+prep_8tap_1d_rnd] 2169 movifnidn ssq, r2mp 2170 movifnidn tmpq, r0mp 2171 punpcklbw m3, m3 2172 psraw m3, 8 ; sign-extend 2173 test dword r7m, 0x800 2174 jnz .v_12bpc 2175 psllw m3, 2 2176.v_12bpc: 2177%if ARCH_X86_32 2178 ALLOC_STACK -16*7 2179 pshufd m0, m3, q0000 2180 pshufd m1, m3, q1111 2181 pshufd m2, m3, q2222 2182 pshufd m3, m3, q3333 2183 mova m8, m0 2184 mova m9, m1 2185 mova m10, m2 2186 mova m11, m3 2187%else 2188 pshufd m8, m3, q0000 2189 pshufd m9, m3, q1111 2190 pshufd m10, m3, q2222 2191 pshufd m11, m3, q3333 2192%endif 2193 lea r6, [ssq*3] 2194 sub srcq, r6 2195 mov r6d, wd 2196 shl wd, 6 2197 mov r5, srcq 2198%if ARCH_X86_64 2199 mov r7, tmpq 2200%elif STACK_ALIGNMENT < 16 2201 mov [esp+4*29], tmpq 2202%endif 2203 lea wd, [wq+hq-(1<<8)] 2204.v_loop0: 2205 movq m1, [srcq+ssq*0] 2206 movq m2, [srcq+ssq*1] 2207 lea srcq, [srcq+ssq*2] 2208 movq m3, [srcq+ssq*0] 2209 movq m4, [srcq+ssq*1] 2210 lea srcq, [srcq+ssq*2] 2211 movq m5, [srcq+ssq*0] 2212 movq m6, [srcq+ssq*1] 2213 lea srcq, [srcq+ssq*2] 2214 movq m0, [srcq+ssq*0] 2215 punpcklwd m1, m2 ; 01 2216 punpcklwd m2, m3 ; 12 2217 punpcklwd m3, m4 ; 23 2218 punpcklwd m4, m5 ; 34 2219 punpcklwd m5, m6 ; 45 2220 punpcklwd m6, m0 ; 56 2221%if ARCH_X86_32 2222 jmp .v_loop_start 2223.v_loop: 2224 mova m1, m12 2225 mova m2, m13 2226 mova m3, m14 2227.v_loop_start: 2228 pmaddwd m1, m8 ; a0 2229 pmaddwd m2, m8 ; b0 2230 mova m12, m3 2231 mova m13, m4 2232 pmaddwd m3, m9 ; a1 2233 pmaddwd m4, m9 ; b1 2234 paddd m1, m3 2235 paddd m2, m4 2236 mova m14, m5 2237 mova m4, m6 2238 pmaddwd m5, m10 ; a2 2239 pmaddwd m6, m10 ; b2 
2240 paddd m1, m5 2241 paddd m2, m6 2242 movq m6, [srcq+ssq*1] 2243 lea srcq, [srcq+ssq*2] 2244 punpcklwd m5, m0, m6 ; 67 2245 movq m0, [srcq+ssq*0] 2246 pmaddwd m3, m11, m5 ; a3 2247 punpcklwd m6, m0 ; 78 2248 paddd m1, m7 2249 paddd m1, m3 2250 pmaddwd m3, m11, m6 ; b3 2251 paddd m2, m7 2252 paddd m2, m3 2253 psrad m1, 4 2254 psrad m2, 4 2255 packssdw m1, m2 2256 movq [tmpq+r6*0], m1 2257 movhps [tmpq+r6*2], m1 2258 lea tmpq, [tmpq+r6*4] 2259 sub hd, 2 2260 jg .v_loop 2261%if STACK_ALIGNMENT < 16 2262 mov tmpq, [esp+4*29] 2263 add r5, 8 2264 add tmpq, 8 2265 mov srcq, r5 2266 mov [esp+4*29], tmpq 2267%else 2268 mov tmpq, tmpmp 2269 add r5, 8 2270 add tmpq, 8 2271 mov srcq, r5 2272 mov tmpmp, tmpq 2273%endif 2274%else 2275.v_loop: 2276 pmaddwd m12, m8, m1 ; a0 2277 pmaddwd m13, m8, m2 ; b0 2278 mova m1, m3 2279 mova m2, m4 2280 pmaddwd m3, m9 ; a1 2281 pmaddwd m4, m9 ; b1 2282 paddd m12, m3 2283 paddd m13, m4 2284 mova m3, m5 2285 mova m4, m6 2286 pmaddwd m5, m10 ; a2 2287 pmaddwd m6, m10 ; b2 2288 paddd m12, m5 2289 paddd m13, m6 2290 movq m6, [srcq+ssq*1] 2291 lea srcq, [srcq+ssq*2] 2292 punpcklwd m5, m0, m6 ; 67 2293 movq m0, [srcq+ssq*0] 2294 pmaddwd m14, m11, m5 ; a3 2295 punpcklwd m6, m0 ; 78 2296 paddd m12, m7 2297 paddd m12, m14 2298 pmaddwd m14, m11, m6 ; b3 2299 paddd m13, m7 2300 paddd m13, m14 2301 psrad m12, 4 2302 psrad m13, 4 2303 packssdw m12, m13 2304 movq [tmpq+r6*0], m12 2305 movhps [tmpq+r6*2], m12 2306 lea tmpq, [tmpq+r6*4] 2307 sub hd, 2 2308 jg .v_loop 2309 add r5, 8 2310 add r7, 8 2311 mov srcq, r5 2312 mov tmpq, r7 2313%endif 2314 movzx hd, wb 2315 sub wd, 1<<8 2316 jg .v_loop0 2317 RET 2318.hv: 2319 RESET_STACK_STATE 2320 movzx t3d, mxb 2321 shr mxd, 16 2322 cmp wd, 4 2323 cmove mxd, t3d 2324 movifnidn hd, r4m 2325 movq m2, [base+subpel_filters+mxq*8] 2326 movzx mxd, myb 2327 shr myd, 16 2328 cmp hd, 4 2329 cmove myd, mxd 2330 movq m3, [base+subpel_filters+myq*8] 2331%if ARCH_X86_32 2332 mov ssq, r2mp 2333 mov tmpq, r0mp 2334 mova m0, 
[base+spel_h_shufA] 2335 mova m1, [base+spel_h_shufB] 2336 mova m4, [base+prep_8tap_2d_rnd] 2337 ALLOC_STACK -16*14 2338 mova m8, m0 2339 mova m9, m1 2340 mova m14, m4 2341%else 2342%if WIN64 2343 ALLOC_STACK 16*6, 16 2344%endif 2345 mova m8, [base+spel_h_shufA] 2346 mova m9, [base+spel_h_shufB] 2347%endif 2348 pxor m0, m0 2349 punpcklbw m0, m2 2350 punpcklbw m3, m3 2351 psraw m0, 4 2352 psraw m3, 8 2353 test dword r7m, 0x800 2354 jz .hv_10bpc 2355 psraw m0, 2 2356.hv_10bpc: 2357 lea r6, [ssq*3] 2358 sub srcq, 6 2359 sub srcq, r6 2360 mov r6d, wd 2361 shl wd, 6 2362 mov r5, srcq 2363%if ARCH_X86_32 2364 %define tmp esp+16*8 2365%if STACK_ALIGNMENT < 16 2366 mov [esp+4*61], tmpq 2367%endif 2368 pshufd m1, m0, q0000 2369 pshufd m2, m0, q1111 2370 pshufd m5, m0, q2222 2371 pshufd m0, m0, q3333 2372 mova m10, m1 2373 mova m11, m2 2374 mova m12, m5 2375 mova m13, m0 2376%else 2377%if WIN64 2378 %define tmp rsp 2379%else 2380 %define tmp rsp-88 ; red zone 2381%endif 2382 mov r7, tmpq 2383 pshufd m10, m0, q0000 2384 pshufd m11, m0, q1111 2385 pshufd m12, m0, q2222 2386 pshufd m13, m0, q3333 2387%endif 2388 lea wd, [wq+hq-(1<<8)] 2389 pshufd m0, m3, q0000 2390 pshufd m1, m3, q1111 2391 pshufd m2, m3, q2222 2392 pshufd m3, m3, q3333 2393 mova [tmp+16*1], m0 2394 mova [tmp+16*2], m1 2395 mova [tmp+16*3], m2 2396 mova [tmp+16*4], m3 2397.hv_loop0: 2398%if ARCH_X86_64 2399 mova m14, [prep_8tap_2d_rnd] 2400%endif 2401 movu m4, [srcq+ssq*0+0] 2402 movu m1, [srcq+ssq*0+8] 2403 movu m5, [srcq+ssq*1+0] 2404 movu m2, [srcq+ssq*1+8] 2405 lea srcq, [srcq+ssq*2] 2406 movu m6, [srcq+ssq*0+0] 2407 movu m3, [srcq+ssq*0+8] 2408 PUT_8TAP_HV_H 4, 1, 0, 6 2409 PUT_8TAP_HV_H 5, 2, 0, 6 2410 PUT_8TAP_HV_H 6, 3, 0, 6 2411 movu m7, [srcq+ssq*1+0] 2412 movu m2, [srcq+ssq*1+8] 2413 lea srcq, [srcq+ssq*2] 2414 movu m1, [srcq+ssq*0+0] 2415 movu m3, [srcq+ssq*0+8] 2416 PUT_8TAP_HV_H 7, 2, 0, 6 2417 PUT_8TAP_HV_H 1, 3, 0, 6 2418 movu m2, [srcq+ssq*1+0] 2419 movu m3, [srcq+ssq*1+8] 2420 lea srcq, 
[srcq+ssq*2] 2421 PUT_8TAP_HV_H 2, 3, 0, 6 2422 packssdw m4, m7 ; 0 3 2423 packssdw m5, m1 ; 1 4 2424 movu m0, [srcq+ssq*0+0] 2425 movu m1, [srcq+ssq*0+8] 2426 PUT_8TAP_HV_H 0, 1, 3, 6 2427 packssdw m6, m2 ; 2 5 2428 packssdw m7, m0 ; 3 6 2429 punpcklwd m1, m4, m5 ; 01 2430 punpckhwd m4, m5 ; 34 2431 punpcklwd m2, m5, m6 ; 12 2432 punpckhwd m5, m6 ; 45 2433 punpcklwd m3, m6, m7 ; 23 2434 punpckhwd m6, m7 ; 56 2435%if ARCH_X86_32 2436 jmp .hv_loop_start 2437.hv_loop: 2438 mova m1, [tmp+16*5] 2439 mova m2, m15 2440.hv_loop_start: 2441 mova m7, [tmp+16*1] 2442 pmaddwd m1, m7 ; a0 2443 pmaddwd m2, m7 ; b0 2444 mova m7, [tmp+16*2] 2445 mova [tmp+16*5], m3 2446 pmaddwd m3, m7 ; a1 2447 mova m15, m4 2448 pmaddwd m4, m7 ; b1 2449 mova m7, [tmp+16*3] 2450 paddd m1, m14 2451 paddd m2, m14 2452 paddd m1, m3 2453 paddd m2, m4 2454 mova m3, m5 2455 pmaddwd m5, m7 ; a2 2456 mova m4, m6 2457 pmaddwd m6, m7 ; b2 2458 paddd m1, m5 2459 paddd m2, m6 2460 movu m7, [srcq+ssq*1+0] 2461 movu m5, [srcq+ssq*1+8] 2462 lea srcq, [srcq+ssq*2] 2463 PUT_8TAP_HV_H 7, 5, 6, 6 2464 packssdw m0, m7 ; 6 7 2465 mova [tmp+16*0], m0 2466 movu m0, [srcq+ssq*0+0] 2467 movu m5, [srcq+ssq*0+8] 2468 PUT_8TAP_HV_H 0, 5, 6, 6 2469 mova m6, [tmp+16*0] 2470 packssdw m7, m0 ; 7 8 2471 punpcklwd m5, m6, m7 ; 67 2472 punpckhwd m6, m7 ; 78 2473 pmaddwd m7, m5, [tmp+16*4] 2474 paddd m1, m7 ; a3 2475 pmaddwd m7, m6, [tmp+16*4] 2476 paddd m2, m7 ; b3 2477 psrad m1, 6 2478 psrad m2, 6 2479 packssdw m1, m2 2480 movq [tmpq+r6*0], m1 2481 movhps [tmpq+r6*2], m1 2482 lea tmpq, [tmpq+r6*4] 2483 sub hd, 2 2484 jg .hv_loop 2485%if STACK_ALIGNMENT < 16 2486 mov tmpq, [esp+4*61] 2487 add r5, 8 2488 add tmpq, 8 2489 mov srcq, r5 2490 mov [esp+4*61], tmpq 2491%else 2492 mov tmpq, tmpmp 2493 add r5, 8 2494 add tmpq, 8 2495 mov srcq, r5 2496 mov tmpmp, tmpq 2497%endif 2498%else 2499.hv_loop: 2500 mova m15, [tmp+16*1] 2501 mova m7, [prep_8tap_2d_rnd] 2502 pmaddwd m14, m15, m1 ; a0 2503 pmaddwd m15, m2 ; b0 2504 paddd m14, m7 2505 
; Tail of prep_8tap_16bpc (x86-64 .hv_loop): second half of the 4-tap
; vertical accumulation over the horizontally-filtered rows, then the
; 6-bit downshift and interleaved store of two output rows.
paddd                m15, m7
    mova              m7, [tmp+16*2]
    mova              m1, m3
    pmaddwd           m3, m7             ; a1
    mova              m2, m4
    pmaddwd           m4, m7             ; b1
    mova              m7, [tmp+16*3]
    paddd            m14, m3
    paddd            m15, m4
    mova              m3, m5
    pmaddwd           m5, m7             ; a2
    mova              m4, m6
    pmaddwd           m6, m7             ; b2
    paddd            m14, m5
    paddd            m15, m6
    ; horizontally filter the next two input rows (rows 7 and 8)
    movu              m7, [srcq+ssq*1+0]
    movu              m5, [srcq+ssq*1+8]
    lea             srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H      7, 5, 6, 6, [prep_8tap_2d_rnd]
    packssdw          m0, m7             ; 6 7
    mova      [tmp+16*0], m0
    movu              m0, [srcq+ssq*0+0]
    movu              m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H      0, 5, 6, 6, [prep_8tap_2d_rnd]
    mova              m6, [tmp+16*0]
    packssdw          m7, m0             ; 7 8
    punpcklwd         m5, m6, m7         ; 67
    punpckhwd         m6, m7             ; 78
    pmaddwd           m7, m5, [tmp+16*4]
    paddd            m14, m7             ; a3
    pmaddwd           m7, m6, [tmp+16*4]
    paddd            m15, m7             ; b3
    psrad            m14, 6
    psrad            m15, 6
    packssdw         m14, m15
    movq    [tmpq+r6*0], m14             ; row a (r6 = output stride in units)
    movhps  [tmpq+r6*2], m14             ; row b
    lea             tmpq, [tmpq+r6*4]
    sub               hd, 2
    jg .hv_loop
    ; advance to the next 4-pixel column strip
    add               r5, 8
    add               r7, 8
    mov             srcq, r5
    mov             tmpq, r7
%endif
    movzx             hd, wb             ; reload height from packed w/h counter
    sub               wd, 1<<8
    jg .hv_loop0
    RET
%undef tmp

; Emit "mov %1, %2" only when assembling the prep_8tap_scaled variant
; (isprep is set by the MC_8TAP_SCALED instantiation below).
%macro movifprep 2
    %if isprep
        mov %1, %2
    %endif
%endmacro

; Preprocessor-level register save: stash the current definitions of
; r%1 / r%1q / r%1d (and, on x86-32, the r%1m stack-argument slot) so a
; later LOAD_REG can restore them after REMAP_REG has re-aliased them.
%macro SAVE_REG 1
    %xdefine r%1_save  r%1
    %xdefine r%1q_save r%1q
    %xdefine r%1d_save r%1d
    %if ARCH_X86_32
        %define r%1m_save [rstk+stack_offset+(%1+1)*4]
    %endif
%endmacro

; Inverse of SAVE_REG: restore the saved aliases and drop the temporaries.
%macro LOAD_REG 1
    %xdefine r%1  r%1_save
    %xdefine r%1q r%1q_save
    %xdefine r%1d r%1d_save
    %if ARCH_X86_32
        %define r%1m r%1m_save
    %endif
    %undef r%1d_save
    %undef r%1q_save
    %undef r%1_save
%endmacro

; Alias register name r%1 (and its q/d views) to r%2. On x86-32 the third
; argument selects whether r%1m aliases r%2m (0) or points at r%1's own
; caller-stack argument slot (nonzero).
%macro REMAP_REG 2-3
    %xdefine r%1  r%2
    %xdefine r%1q r%2q
    %xdefine r%1d r%2d
    %if ARCH_X86_32
        %if %3 == 0
            %xdefine r%1m r%2m
        %else
            %define r%1m [rstk+stack_offset+(%1+1)*4]
        %endif
    %endif
%endmacro

; For the prep variant only: shift every register name down by one
; (rN -> r(N-1)), so shared put/prep code bodies can be assembled with
; prep's one-fewer leading argument. r14 (x86-64) / r5 (x86-32) is saved
; first since its old definition is otherwise lost.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
    %if isprep
        %if ARCH_X86_64
            SAVE_REG 14
            %assign %%i 14
            %rep 14
                %assign %%j %%i-1
                REMAP_REG %%i, %%j
                %assign %%i %%i-1
            %endrep
        %else
            SAVE_REG 5
            %assign %%i 5
            %rep 5
                %assign %%j %%i-1
                REMAP_REG %%i, %%j, 0
                %assign %%i %%i-1
            %endrep
        %endif
    %endif
%endmacro

; Inverse of the above: shift register names back up (rN -> r(N+1)) and
; restore the saved top register, returning to the default x86inc mapping.
%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
    %if isprep
        %assign %%i 1
        %if ARCH_X86_64
            %rep 13
                %assign %%j %%i+1
                REMAP_REG %%i, %%j
                %assign %%i %%i+1
            %endrep
            LOAD_REG 14
        %else
            %rep 4
                %assign %%j %%i+1
                REMAP_REG %%i, %%j, 1
                %assign %%i %%i+1
            %endrep
            LOAD_REG 5
        %endif
    %endif
%endmacro

; RET with the default register mapping in effect (required so x86inc
; restores the correct callee-saved registers); optionally re-apply the
; shifted mapping afterwards for code that continues past this return.
%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
    %if %1
        MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
    %endif
%endmacro

%if ARCH_X86_32
    ; Horizontal 4-tap scaled filter, x86-32 only: filter two rows from
    ; srcq and two from r4 (shuffle masks m12/m14, weights m13/m15),
    ; pairwise-add, round with [esp+0x00] and shift by [esp+0x10], then
    ; pack to words and store at stack offset %1.
    %macro MC_4TAP_SCALED_H 1 ; dst_mem
        movu              m7, [srcq+ssq*0]
        movu              m2, [srcq+ssq*1]
        movu              m5, [r4  +ssq*0]
        movu              m6, [r4  +ssq*1]
        lea             srcq, [srcq+ssq*2]
        lea               r4, [r4  +ssq*2]
        REPX {pshufb  x, m12}, m7, m2
        REPX {pmaddwd x, m13}, m7, m2
        REPX {pshufb  x, m14}, m5, m6
        REPX {pmaddwd x, m15}, m5, m6
        phaddd            m7, m5
        phaddd            m2, m6
        mova              m5, [esp+0x00]   ; rounding constant
        movd              m6, [esp+0x10]   ; shift amount
        paddd             m7, m5
        paddd             m2, m5
        psrad             m7, m6
        psrad             m2, m6
        packssdw          m7, m2
        mova        [stk+%1], m7
    %endmacro
%endif

%if ARCH_X86_64
    ; Horizontal 8-tap scaled filter, x86-64: load 8 pixel groups at the
    ; per-column offsets held in r4/r6/r7/r9/r10/r11/r13/rX, multiply by
    ; the per-column filter weights spilled at stk+0x10..0x80, reduce via
    ; a phaddd tree, round (hround) and shift (m12), pack to words in m%1.
    %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
        movu             m%1, [srcq+ r4*2]
        movu             m%2, [srcq+ r6*2]
        movu             m%3, [srcq+ r7*2]
        movu             m%4, [srcq+ r9*2]
        movu             m%5, [srcq+r10*2]
        movu             m%6, [srcq+r11*2]
        movu             m%7, [srcq+r13*2]
        movu             m%8, [srcq+ rX*2]
        add             srcq, ssq
        pmaddwd          m%1, [stk+0x10]
        pmaddwd          m%2, [stk+0x20]
        pmaddwd          m%3, [stk+0x30]
        pmaddwd          m%4, [stk+0x40]
pmaddwd          m%5, [stk+0x50]
        pmaddwd          m%6, [stk+0x60]
        pmaddwd          m%7, [stk+0x70]
        pmaddwd          m%8, [stk+0x80]
        ; phaddd reduction tree: 8 vectors of per-tap products down to
        ; two vectors of per-pixel sums
        phaddd           m%1, m%2
        phaddd           m%3, m%4
        phaddd           m%5, m%6
        phaddd           m%7, m%8
        phaddd           m%1, m%3
        phaddd           m%5, m%7
        paddd            m%1, hround      ; horizontal rounding constant (defined elsewhere in the file)
        paddd            m%5, hround
        psrad            m%1, m12         ; horizontal shift (loaded from s_8tap_h_sh)
        psrad            m%5, m12
        packssdw         m%1, m%5
    %endmacro
%else
    ; x86-32 variant of the horizontal 8-tap scaled filter: only 8 xmm
    ; registers are available, so the per-column source offsets live on
    ; the stack (stk+0..28) and are reloaded through r0/rX/r4/r5.
    ; %1 = stack offset of the filter weights, %2 = stack slot for the
    ; packed result (0 = leave in m0), %3 = reload the first four offsets.
    %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
        %if %3 == 1
            mov           r0, [stk+ 0]
            mov           rX, [stk+ 4]
            mov           r4, [stk+ 8]
            mov           r5, [stk+12]
        %endif
        movu              m0, [srcq+r0*2]
        movu              m1, [srcq+rX*2]
        movu              m2, [srcq+r4*2]
        movu              m3, [srcq+r5*2]
        mov               r0, [stk+16]
        mov               rX, [stk+20]
        mov               r4, [stk+24]
        mov               r5, [stk+28]
        pmaddwd           m0, [stk+%1+0x00]
        pmaddwd           m1, [stk+%1+0x10]
        pmaddwd           m2, [stk+%1+0x20]
        pmaddwd           m3, [stk+%1+0x30]
        phaddd            m0, m1
        phaddd            m2, m3
        movu              m4, [srcq+r0*2]
        movu              m5, [srcq+rX*2]
        movu              m6, [srcq+r4*2]
        movu              m7, [srcq+r5*2]
        add             srcq, ssq
        pmaddwd           m4, [stk+%1+0xa0]
        pmaddwd           m5, [stk+%1+0xb0]
        pmaddwd           m6, [stk+%1+0xc0]
        pmaddwd           m7, [stk+%1+0xd0]
        phaddd            m4, m5
        phaddd            m6, m7
        phaddd            m0, m2
        phaddd            m4, m6
        paddd             m0, hround
        paddd             m4, hround
        psrad             m0, m12
        psrad             m4, m12
        packssdw          m0, m4
        %if %2 != 0
            mova    [stk+%2], m0
        %endif
    %endmacro
%endif

; Instantiates put_8tap_scaled_16bpc or prep_8tap_scaled_16bpc from one
; shared body (%1 is "put" or "prep"); the macro body continues beyond
; this chunk. When the required stack alignment exceeds what the caller
; guarantees, one fewer register is reserved so x86inc can realign.
%macro MC_8TAP_SCALED 1
%ifidn %1, put
    %assign isput 1
    %assign isprep 0
    %if ARCH_X86_64
        %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
        %else
cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
        %endif
    %else ; ARCH_X86_32
        %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
%else 2763cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax 2764 %endif 2765 %endif 2766 %xdefine base_reg r12 2767%else ; prep 2768 %assign isput 0 2769 %assign isprep 1 2770 %if ARCH_X86_64 2771 %if required_stack_alignment <= STACK_ALIGNMENT 2772cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax 2773 %xdefine tmp_stridem r14q 2774 %else 2775cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax 2776 %define tmp_stridem qword [stk+0x138] 2777 %endif 2778 %xdefine base_reg r11 2779 %else ; ARCH_X86_32 2780 %if required_stack_alignment <= STACK_ALIGNMENT 2781cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax 2782 %else 2783cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax 2784 %endif 2785 %define tmp_stridem dword [stk+0x138] 2786 %endif 2787%endif 2788%if ARCH_X86_32 2789 mov [esp+0x1f0], t0d 2790 mov [esp+0x1f4], t1d 2791 %if isput && required_stack_alignment > STACK_ALIGNMENT 2792 mov dstd, dstm 2793 mov dsd, dsm 2794 mov srcd, srcm 2795 mov ssd, ssm 2796 mov hd, hm 2797 mov r4, mxm 2798 %define r0m [esp+0x200] 2799 %define dsm [esp+0x204] 2800 %define dsmp dsm 2801 %define r1m dsm 2802 %define r2m [esp+0x208] 2803 %define ssm [esp+0x20c] 2804 %define r3m ssm 2805 %define hm [esp+0x210] 2806 %define mxm [esp+0x214] 2807 mov r0m, dstd 2808 mov dsm, dsd 2809 mov r2m, srcd 2810 mov ssm, ssd 2811 mov hm, hd 2812 mov r0, mym 2813 mov r1, dxm 2814 mov r2, dym 2815 %define mym [esp+0x218] 2816 %define dxm [esp+0x21c] 2817 %define dym [esp+0x220] 2818 mov mxm, r4 2819 mov mym, r0 2820 mov dxm, r1 2821 mov dym, r2 2822 tzcnt wd, wm 2823 %endif 2824 %if isput 2825 mov r3, pxmaxm 2826 %define pxmaxm r3 2827 %else 2828 mov r2, pxmaxm 2829 %endif 2830 %if isprep && required_stack_alignment > STACK_ALIGNMENT 2831 %xdefine base_reg r5 2832 %else 2833 %xdefine base_reg r6 
2834 %endif 2835%endif 2836 LEA base_reg, %1_8tap_scaled_16bpc_ssse3 2837%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3 2838%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT 2839 tzcnt wd, wm 2840%endif 2841%if ARCH_X86_64 2842 %if isput 2843 mov r7d, pxmaxm 2844 %endif 2845%else 2846 %define m8 m0 2847 %define m9 m1 2848 %define m14 m4 2849 %define m15 m3 2850%endif 2851 movd m8, dxm 2852 movd m14, mxm 2853%if isput 2854 movd m15, pxmaxm 2855%endif 2856 pshufd m8, m8, q0000 2857 pshufd m14, m14, q0000 2858%if isput 2859 pshuflw m15, m15, q0000 2860 punpcklqdq m15, m15 2861%endif 2862%if isprep 2863 %if UNIX64 2864 mov r5d, t0d 2865 DECLARE_REG_TMP 5, 7 2866 %endif 2867 %if ARCH_X86_64 2868 mov r6d, pxmaxm 2869 %endif 2870%endif 2871%if ARCH_X86_64 2872 mov dyd, dym 2873%endif 2874%if isput 2875 %if WIN64 2876 mov r8d, hm 2877 DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3 2878 %define hm r5m 2879 %define dxm r8m 2880 %elif ARCH_X86_64 2881 DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3 2882 %define hm r6m 2883 %else 2884 %endif 2885 %if ARCH_X86_64 2886 %if required_stack_alignment > STACK_ALIGNMENT 2887 %define dsm [rsp+0x138] 2888 %define rX r1 2889 %define rXd r1d 2890 %else 2891 %define dsm dsq 2892 %define rX r14 2893 %define rXd r14d 2894 %endif 2895 %else 2896 %define rX r1 2897 %endif 2898%else ; prep 2899 %if WIN64 2900 mov r7d, hm 2901 DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3 2902 %define hm r4m 2903 %define dxm r7m 2904 %elif ARCH_X86_64 2905 DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3 2906 %xdefine hm r7m 2907 %endif 2908 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 2909 %if ARCH_X86_64 2910 %define rX r14 2911 %define rXd r14d 2912 %else 2913 %define rX r3 2914 %endif 2915%endif 2916%if ARCH_X86_64 2917 shr r7d, 11 2918 mova m10, [base+pd_0x3ff] 2919 movddup m11, [base+s_8tap_h_rnd+r7*8] 2920 movd m12, [base+s_8tap_h_sh+r7*4] 2921 %if isput 2922 movddup m13, [base+put_s_8tap_v_rnd+r7*8] 2923 movd m7, 
[base+put_s_8tap_v_sh+r7*4] 2924 %define pxmaxm [rsp] 2925 mova pxmaxm, m15 2926 punpcklqdq m12, m7 2927 %endif 2928 lea ss3q, [ssq*3] 2929 movzx r7d, t1b 2930 shr t1d, 16 2931 cmp hd, 6 2932 cmovs t1d, r7d 2933 sub srcq, ss3q 2934%else 2935 %define m10 [base+pd_0x3ff] 2936 %define m11 [esp+0x00] 2937 %define m12 [esp+0x10] 2938 shr r3, 11 2939 movddup m1, [base+s_8tap_h_rnd+r3*8] 2940 movd m2, [base+s_8tap_h_sh+r3*4] 2941 %if isput 2942 %define m13 [esp+0x20] 2943 %define pxmaxm [esp+0x30] 2944 %define stk esp+0x40 2945 movddup m5, [base+put_s_8tap_v_rnd+r3*8] 2946 movd m6, [base+put_s_8tap_v_sh+r3*4] 2947 mova pxmaxm, m15 2948 punpcklqdq m2, m6 2949 mova m13, m5 2950 %else 2951 %define m13 [base+pd_m524256] 2952 %endif 2953 mov ssd, ssm 2954 mova m11, m1 2955 mova m12, m2 2956 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 2957 mov r1, [esp+0x1f4] 2958 lea r0, [ssd*3] 2959 movzx r2, r1b 2960 shr r1, 16 2961 cmp dword hm, 6 2962 cmovs r1, r2 2963 mov [esp+0x1f4], r1 2964 %if isprep 2965 mov r1, r1m 2966 %endif 2967 mov r2, r2m 2968 sub srcq, r0 2969 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 2970 %define ss3q r0 2971 %define myd r4 2972 %define dyd dword dym 2973 %define hd dword hm 2974%endif 2975 cmp dyd, 1024 2976 je .dy1 2977 cmp dyd, 2048 2978 je .dy2 2979 movzx wd, word [base+%1_8tap_scaled_ssse3_table+wq*2] 2980 add wq, base_reg 2981 jmp wq 2982%if isput 2983.w2: 2984 %if ARCH_X86_64 2985 mov myd, mym 2986 movzx t0d, t0b 2987 sub srcq, 2 2988 movd m15, t0d 2989 %else 2990 movzx r4, byte [esp+0x1f0] 2991 sub srcq, 2 2992 movd m15, r4 2993 %endif 2994 pxor m9, m9 2995 punpckldq m9, m8 2996 paddd m14, m9 ; mx+dx*[0-1] 2997 %if ARCH_X86_64 2998 mova m9, [base+pd_0x4000] 2999 %endif 3000 pshufd m15, m15, q0000 3001 pand m8, m14, m10 3002 psrld m8, 6 3003 paddd m15, m8 3004 movd r4d, m15 3005 pshufd m15, m15, q0321 3006 %if ARCH_X86_64 3007 movd r6d, m15 3008 %else 3009 movd r3d, m15 3010 %endif 3011 mova m5, [base+bdct_lb_q] 3012 mova m6, [base+spel_s_shuf2] 3013 movd m15, 
[base+subpel_filters+r4*8+2] 3014 %if ARCH_X86_64 3015 movd m7, [base+subpel_filters+r6*8+2] 3016 %else 3017 movd m7, [base+subpel_filters+r3*8+2] 3018 %endif 3019 pxor m2, m2 3020 pcmpeqd m8, m2 3021 psrld m14, 10 3022 paddd m14, m14 3023 %if ARCH_X86_32 3024 mov r3, r3m 3025 pshufb m14, m5 3026 paddb m14, m6 3027 mova [stk], m14 3028 SWAP m5, m0 3029 SWAP m6, m3 3030 %define m15 m6 3031 %endif 3032 movu m0, [srcq+ssq*0] 3033 movu m1, [srcq+ssq*1] 3034 movu m2, [srcq+ssq*2] 3035 movu m3, [srcq+ss3q ] 3036 lea srcq, [srcq+ssq*4] 3037 punpckldq m15, m7 3038 %if ARCH_X86_64 3039 pshufb m14, m5 3040 paddb m14, m6 3041 pand m9, m8 3042 pandn m8, m15 3043 SWAP m15, m8 3044 por m15, m9 3045 movu m4, [srcq+ssq*0] 3046 movu m5, [srcq+ssq*1] 3047 movu m6, [srcq+ssq*2] 3048 movu m7, [srcq+ss3q ] 3049 lea srcq, [srcq+ssq*4] 3050 %else 3051 pand m7, m5, [base+pd_0x4000] 3052 pandn m5, m15 3053 por m5, m7 3054 %define m15 m5 3055 %endif 3056 punpcklbw m15, m15 3057 psraw m15, 8 3058 REPX {pshufb x, m14}, m0, m1, m2, m3 3059 REPX {pmaddwd x, m15}, m0, m1, m2, m3 3060 %if ARCH_X86_64 3061 REPX {pshufb x, m14}, m4, m5, m6, m7 3062 REPX {pmaddwd x, m15}, m4, m5, m6, m7 3063 phaddd m0, m1 3064 phaddd m2, m3 3065 phaddd m4, m5 3066 phaddd m6, m7 3067 REPX {paddd x, m11}, m0, m2, m4, m6 3068 REPX {psrad x, m12}, m0, m2, m4, m6 3069 packssdw m0, m2 ; 0 1 2 3 3070 packssdw m4, m6 ; 4 5 6 7 3071 SWAP m1, m4 3072 %else 3073 mova [stk+0x10], m15 3074 phaddd m0, m1 3075 phaddd m2, m3 3076 movu m1, [srcq+ssq*0] 3077 movu m7, [srcq+ssq*1] 3078 movu m6, [srcq+ssq*2] 3079 movu m3, [srcq+ss3q ] 3080 lea srcq, [srcq+ssq*4] 3081 REPX {pshufb x, m14}, m1, m7, m6, m3 3082 REPX {pmaddwd x, m15}, m1, m7, m6, m3 3083 phaddd m1, m7 3084 phaddd m6, m3 3085 REPX {paddd x, m11}, m0, m2, m1, m6 3086 REPX {psrad x, m12}, m0, m2, m1, m6 3087 packssdw m0, m2 3088 packssdw m1, m6 3089 %define m14 [stk+0x00] 3090 %define m15 [stk+0x10] 3091 %endif 3092 palignr m2, m1, m0, 4 ; 1 2 3 4 3093 punpcklwd m3, m0, m2 ; 
01 12 3094 punpckhwd m0, m2 ; 23 34 3095 pshufd m5, m1, q0321 ; 5 6 7 _ 3096 punpcklwd m2, m1, m5 ; 45 56 3097 punpckhwd m4, m1, m5 ; 67 __ 3098 %if ARCH_X86_32 3099 mov myd, mym 3100 mov r0, r0m 3101 mova [stk+0x20], m3 3102 mova [stk+0x30], m0 3103 mova [stk+0x40], m2 3104 mova [stk+0x50], m4 3105 %endif 3106.w2_loop: 3107 and myd, 0x3ff 3108 %if ARCH_X86_64 3109 mov r6d, 64 << 24 3110 mov r4d, myd 3111 shr r4d, 6 3112 lea r4d, [t1+r4] 3113 cmovnz r6q, [base+subpel_filters+r4*8] 3114 movq m10, r6q 3115 punpcklbw m10, m10 3116 psraw m10, 8 3117 pshufd m7, m10, q0000 3118 pshufd m8, m10, q1111 3119 pmaddwd m5, m3, m7 3120 pmaddwd m6, m0, m8 3121 pshufd m9, m10, q2222 3122 pshufd m10, m10, q3333 3123 pmaddwd m7, m2, m9 3124 pmaddwd m8, m4, m10 3125 paddd m5, m6 3126 paddd m7, m8 3127 %else 3128 mov r1, [esp+0x1f4] 3129 xor r3, r3 3130 mov r5, myd 3131 shr r5, 6 3132 lea r1, [r1+r5] 3133 mov r5, 64 << 24 3134 cmovnz r3, [base+subpel_filters+r1*8+4] 3135 cmovnz r5, [base+subpel_filters+r1*8+0] 3136 movd m6, r3 3137 movd m7, r5 3138 punpckldq m7, m6 3139 punpcklbw m7, m7 3140 psraw m7, 8 3141 pshufd m5, m7, q0000 3142 pshufd m6, m7, q1111 3143 pmaddwd m3, m5 3144 pmaddwd m0, m6 3145 pshufd m5, m7, q2222 3146 pshufd m7, m7, q3333 3147 pmaddwd m2, m5 3148 pmaddwd m4, m7 3149 paddd m3, m0 3150 paddd m2, m4 3151 SWAP m5, m3 3152 SWAP m7, m2 3153 %define m8 m3 3154 %endif 3155 paddd m5, m13 3156 pshufd m6, m12, q1032 3157 pxor m8, m8 3158 paddd m5, m7 3159 psrad m5, m6 3160 packssdw m5, m5 3161 pmaxsw m5, m8 3162 pminsw m5, pxmaxm 3163 movd [dstq], m5 3164 add dstq, dsmp 3165 dec hd 3166 jz .ret 3167 %if ARCH_X86_64 3168 add myd, dyd 3169 %else 3170 add myd, dym 3171 %endif 3172 test myd, ~0x3ff 3173 %if ARCH_X86_32 3174 SWAP m3, m5 3175 SWAP m2, m7 3176 mova m3, [stk+0x20] 3177 mova m0, [stk+0x30] 3178 mova m2, [stk+0x40] 3179 mova m4, [stk+0x50] 3180 %endif 3181 jz .w2_loop 3182 %if ARCH_X86_32 3183 mov r3, r3m 3184 %endif 3185 movu m5, [srcq] 3186 test myd, 0x400 3187 jz 
.w2_skip_line 3188 add srcq, ssq 3189 shufps m3, m0, q1032 ; 01 12 3190 shufps m0, m2, q1032 ; 23 34 3191 shufps m2, m4, q1032 ; 45 56 3192 pshufb m5, m14 3193 pmaddwd m5, m15 3194 phaddd m5, m5 3195 paddd m5, m11 3196 psrad m5, m12 3197 packssdw m5, m5 3198 palignr m4, m5, m1, 12 3199 punpcklqdq m1, m4, m4 ; 6 7 6 7 3200 punpcklwd m4, m1, m5 ; 67 __ 3201 %if ARCH_X86_32 3202 mova [stk+0x20], m3 3203 mova [stk+0x30], m0 3204 mova [stk+0x40], m2 3205 mova [stk+0x50], m4 3206 %endif 3207 jmp .w2_loop 3208.w2_skip_line: 3209 movu m6, [srcq+ssq*1] 3210 lea srcq, [srcq+ssq*2] 3211 mova m3, m0 ; 01 12 3212 mova m0, m2 ; 23 34 3213 pshufb m5, m14 3214 pshufb m6, m14 3215 pmaddwd m5, m15 3216 pmaddwd m6, m15 3217 phaddd m5, m6 3218 paddd m5, m11 3219 psrad m5, m12 3220 packssdw m5, m5 ; 6 7 6 7 3221 punpckhqdq m1, m5 ; 4 5 6 7 3222 pshufd m5, m1, q0321 ; 5 6 7 _ 3223 punpcklwd m2, m1, m5 ; 45 56 3224 punpckhwd m4, m1, m5 ; 67 __ 3225 %if ARCH_X86_32 3226 mova [stk+0x20], m3 3227 mova [stk+0x30], m0 3228 mova [stk+0x40], m2 3229 mova [stk+0x50], m4 3230 %endif 3231 jmp .w2_loop 3232%endif 3233INIT_XMM ssse3 3234.w4: 3235%if ARCH_X86_64 3236 mov myd, mym 3237 mova [rsp+0x10], m11 3238 mova [rsp+0x20], m12 3239 %if isput 3240 mova [rsp+0x30], m13 3241 %endif 3242 movzx t0d, t0b 3243 sub srcq, 2 3244 movd m15, t0d 3245%else 3246 %define m8 m0 3247 %xdefine m14 m4 3248 %define m15 m3 3249 movzx r4, byte [esp+0x1f0] 3250 sub srcq, 2 3251 movd m15, r4 3252%endif 3253 pmaddwd m8, [base+rescale_mul] 3254%if ARCH_X86_64 3255 mova m9, [base+pd_0x4000] 3256%else 3257 %define m9 [base+pd_0x4000] 3258%endif 3259 pshufd m15, m15, q0000 3260 paddd m14, m8 ; mx+dx*[0-3] 3261 pand m0, m14, m10 3262 psrld m0, 6 3263 paddd m15, m0 3264 pshufd m7, m15, q1032 3265%if ARCH_X86_64 3266 movd r4d, m15 3267 movd r11d, m7 3268 pshufd m15, m15, q0321 3269 pshufd m7, m7, q0321 3270 movd r6d, m15 3271 movd r13d, m7 3272 mova m10, [base+bdct_lb_q+ 0] 3273 mova m11, [base+bdct_lb_q+16] 3274 movd m13, 
[base+subpel_filters+ r4*8+2] 3275 movd m2, [base+subpel_filters+ r6*8+2] 3276 movd m15, [base+subpel_filters+r11*8+2] 3277 movd m4, [base+subpel_filters+r13*8+2] 3278%else 3279 movd r0, m15 3280 movd r4, m7 3281 pshufd m15, m15, q0321 3282 pshufd m7, m7, q0321 3283 movd rX, m15 3284 movd r5, m7 3285 mova m5, [base+bdct_lb_q+ 0] 3286 mova m6, [base+bdct_lb_q+16] 3287 movd m1, [base+subpel_filters+r0*8+2] 3288 movd m2, [base+subpel_filters+rX*8+2] 3289 movd m3, [base+subpel_filters+r4*8+2] 3290 movd m7, [base+subpel_filters+r5*8+2] 3291 movifprep r3, r3m 3292 SWAP m4, m7 3293 %define m10 m5 3294 %define m11 m6 3295 %define m12 m1 3296 %define m13 m1 3297%endif 3298 psrld m14, 10 3299 paddd m14, m14 3300 punpckldq m13, m2 3301 punpckldq m15, m4 3302 punpcklqdq m13, m15 3303 pxor m2, m2 3304 pcmpeqd m0, m2 3305%if ARCH_X86_64 3306 pand m9, m0 3307%else 3308 pand m2, m9, m0 3309 %define m9 m2 3310 SWAP m7, m4 3311%endif 3312 pandn m0, m13 3313%if ARCH_X86_64 3314 SWAP m13, m0 3315%else 3316 %define m13 m0 3317%endif 3318 por m13, m9 3319 punpckhbw m15, m13, m13 3320 punpcklbw m13, m13 3321 psraw m15, 8 3322 psraw m13, 8 3323 pshufb m12, m14, m10 3324 pshufb m14, m11 3325 mova m10, [base+spel_s_shuf2] 3326 movd r4d, m14 3327 shr r4d, 24 3328%if ARCH_X86_32 3329 mova [stk+0x20], m13 3330 mova [stk+0x30], m15 3331 pxor m2, m2 3332%endif 3333 pshufb m7, m14, m2 3334 psubb m14, m7 3335 paddb m12, m10 3336 paddb m14, m10 3337%if ARCH_X86_64 3338 lea r6, [r4+ssq*1] 3339 lea r11, [r4+ssq*2] 3340 lea r13, [r4+ss3q ] 3341 movu m7, [srcq+ssq*0] 3342 movu m9, [srcq+ssq*1] 3343 movu m8, [srcq+ssq*2] 3344 movu m10, [srcq+ss3q ] 3345 movu m1, [srcq+r4 ] 3346 movu m3, [srcq+r6 ] 3347 movu m2, [srcq+r11 ] 3348 movu m4, [srcq+r13 ] 3349 lea srcq, [srcq+ssq*4] 3350 REPX {pshufb x, m12}, m7, m9, m8, m10 3351 REPX {pmaddwd x, m13}, m7, m9, m8, m10 3352 REPX {pshufb x, m14}, m1, m2, m3, m4 3353 REPX {pmaddwd x, m15}, m1, m2, m3, m4 3354 mova m5, [rsp+0x10] 3355 movd xm6, [rsp+0x20] 3356 
phaddd m7, m1 3357 phaddd m9, m3 3358 phaddd m8, m2 3359 phaddd m10, m4 3360 movu m1, [srcq+ssq*0] 3361 movu m2, [srcq+ssq*1] 3362 movu m3, [srcq+ssq*2] 3363 movu m4, [srcq+ss3q ] 3364 REPX {paddd x, m5}, m7, m9, m8, m10 3365 REPX {psrad x, xm6}, m7, m9, m8, m10 3366 packssdw m7, m9 ; 0 1 3367 packssdw m8, m10 ; 2 3 3368 movu m0, [srcq+r4 ] 3369 movu m9, [srcq+r6 ] 3370 movu m10, [srcq+r11 ] 3371 movu m11, [srcq+r13 ] 3372 lea srcq, [srcq+ssq*4] 3373 REPX {pshufb x, m12}, m1, m2, m3, m4 3374 REPX {pmaddwd x, m13}, m1, m2, m3, m4 3375 REPX {pshufb x, m14}, m0, m9, m10, m11 3376 REPX {pmaddwd x, m15}, m0, m9, m10, m11 3377 phaddd m1, m0 3378 phaddd m2, m9 3379 phaddd m3, m10 3380 phaddd m4, m11 3381 REPX {paddd x, m5}, m1, m2, m3, m4 3382 REPX {psrad x, xm6}, m1, m2, m3, m4 3383 packssdw m1, m2 ; 4 5 3384 packssdw m3, m4 ; 6 7 3385 SWAP m9, m1 3386 shufps m4, m7, m8, q1032 ; 1 2 3387 shufps m5, m8, m9, q1032 ; 3 4 3388 shufps m6, m9, m3, q1032 ; 5 6 3389 pshufd m10, m3, q1032 ; 7 _ 3390 punpcklwd m0, m7, m4 ; 01 3391 punpckhwd m7, m4 ; 12 3392 punpcklwd m1, m8, m5 ; 23 3393 punpckhwd m8, m5 ; 34 3394 punpcklwd m2, m9, m6 ; 45 3395 punpckhwd m9, m6 ; 56 3396 punpcklwd m3, m10 ; 67 3397 mova [rsp+0x40], m7 3398 mova [rsp+0x50], m8 3399 mova [rsp+0x60], m9 3400%else 3401 mova [stk+0x00], m12 3402 mova [stk+0x10], m14 3403 add r4, srcq 3404 MC_4TAP_SCALED_H 0x40 ; 0 1 3405 MC_4TAP_SCALED_H 0x50 ; 2 3 3406 MC_4TAP_SCALED_H 0x60 ; 4 5 3407 MC_4TAP_SCALED_H 0x70 ; 6 7 3408 mova m4, [stk+0x40] 3409 mova m5, [stk+0x50] 3410 mova m6, [stk+0x60] 3411 mova m7, [stk+0x70] 3412 mov [stk+0xc0], r4 3413 shufps m1, m4, m5, q1032 ; 1 2 3414 shufps m2, m5, m6, q1032 ; 3 4 3415 shufps m3, m6, m7, q1032 ; 5 6 3416 pshufd m0, m7, q1032 ; 7 _ 3417 mova [stk+0xb0], m0 3418 punpcklwd m0, m4, m1 ; 01 3419 punpckhwd m4, m1 ; 12 3420 punpcklwd m1, m5, m2 ; 23 3421 punpckhwd m5, m2 ; 34 3422 punpcklwd m2, m6, m3 ; 45 3423 punpckhwd m6, m3 ; 56 3424 punpcklwd m3, m7, [stk+0xb0] ; 67 3425 mov myd, 
mym 3426 mov r0, r0m 3427 mova [stk+0x40], m0 ; 01 3428 mova [stk+0x50], m1 ; 23 3429 mova [stk+0x60], m2 ; 45 3430 mova [stk+0x70], m3 ; 67 3431 mova [stk+0x80], m4 ; 12 3432 mova [stk+0x90], m5 ; 34 3433 mova [stk+0xa0], m6 ; 56 3434 %define m12 [stk+0x00] 3435 %define m14 [stk+0x10] 3436 %define m13 [stk+0x20] 3437 %define m15 [stk+0x30] 3438 %define hrnd_mem [esp+0x00] 3439 %define hsh_mem [esp+0x10] 3440 %if isput 3441 %define vrnd_mem [esp+0x20] 3442 %else 3443 %define vrnd_mem [base+pd_m524256] 3444 %endif 3445%endif 3446.w4_loop: 3447 and myd, 0x3ff 3448%if ARCH_X86_64 3449 mov r11d, 64 << 24 3450 mov r13d, myd 3451 shr r13d, 6 3452 lea r13d, [t1+r13] 3453 cmovnz r11q, [base+subpel_filters+r13*8] 3454 movq m9, r11q 3455 punpcklbw m9, m9 3456 psraw m9, 8 3457 pshufd m7, m9, q0000 3458 pshufd m8, m9, q1111 3459 pmaddwd m4, m0, m7 3460 pmaddwd m5, m1, m8 3461 pshufd m7, m9, q2222 3462 pshufd m9, m9, q3333 3463 pmaddwd m6, m2, m7 3464 pmaddwd m8, m3, m9 3465 %if isput 3466 movd m9, [rsp+0x28] 3467 %define vrnd_mem [rsp+0x30] 3468 %else 3469 %define vrnd_mem [base+pd_m524256] 3470 %endif 3471 paddd m4, m5 3472 paddd m6, m8 3473 paddd m4, m6 3474 paddd m4, vrnd_mem 3475%else 3476 mov mym, myd 3477 mov r5, [esp+0x1f4] 3478 xor r3, r3 3479 shr r4, 6 3480 lea r5, [r5+r4] 3481 mov r4, 64 << 24 3482 cmovnz r4, [base+subpel_filters+r5*8+0] 3483 cmovnz r3, [base+subpel_filters+r5*8+4] 3484 movd m7, r4 3485 movd m6, r3 3486 punpckldq m7, m6 3487 punpcklbw m7, m7 3488 psraw m7, 8 3489 pshufd m4, m7, q0000 3490 pshufd m5, m7, q1111 3491 pshufd m6, m7, q2222 3492 pshufd m7, m7, q3333 3493 pmaddwd m0, m4 3494 pmaddwd m1, m5 3495 pmaddwd m2, m6 3496 pmaddwd m3, m7 3497 %if isput 3498 movd m4, [esp+0x18] 3499 %endif 3500 paddd m0, m1 3501 paddd m2, m3 3502 paddd m0, vrnd_mem 3503 paddd m0, m2 3504 SWAP m4, m0 3505 %define m9 m0 3506%endif 3507%if isput 3508 pxor m5, m5 3509 psrad m4, m9 3510 packssdw m4, m4 3511 pmaxsw m4, m5 3512 pminsw m4, pxmaxm 3513 movq [dstq], m4 3514 
add dstq, dsmp 3515%else 3516 psrad m4, 6 3517 packssdw m4, m4 3518 movq [tmpq], m4 3519 add tmpq, 8 3520%endif 3521 dec hd 3522 jz .ret 3523%if ARCH_X86_64 3524 add myd, dyd 3525 test myd, ~0x3ff 3526 jz .w4_loop 3527 mova m8, [rsp+0x10] 3528 movd m9, [rsp+0x20] 3529 movu m4, [srcq] 3530 movu m5, [srcq+r4] 3531 test myd, 0x400 3532 jz .w4_skip_line 3533 mova m0, [rsp+0x40] 3534 mova [rsp+0x40], m1 3535 mova m1, [rsp+0x50] 3536 mova [rsp+0x50], m2 3537 mova m2, [rsp+0x60] 3538 mova [rsp+0x60], m3 3539 pshufb m4, m12 3540 pshufb m5, m14 3541 pmaddwd m4, m13 3542 pmaddwd m5, m15 3543 phaddd m4, m5 3544 paddd m4, m8 3545 psrad m4, m9 3546 packssdw m4, m4 3547 punpcklwd m3, m10, m4 3548 mova m10, m4 3549 add srcq, ssq 3550 jmp .w4_loop 3551.w4_skip_line: 3552 movu m6, [srcq+ssq*1] 3553 movu m7, [srcq+r6] 3554 mova m0, [rsp+0x50] 3555 mova m11, [rsp+0x60] 3556 pshufb m4, m12 3557 pshufb m6, m12 3558 pshufb m5, m14 3559 pshufb m7, m14 3560 pmaddwd m4, m13 3561 pmaddwd m6, m13 3562 pmaddwd m5, m15 3563 pmaddwd m7, m15 3564 mova [rsp+0x40], m0 3565 mova [rsp+0x50], m11 3566 phaddd m4, m5 3567 phaddd m6, m7 3568 paddd m4, m8 3569 paddd m6, m8 3570 psrad m4, m9 3571 psrad m6, m9 3572 packssdw m4, m6 3573 punpcklwd m9, m10, m4 3574 mova [rsp+0x60], m9 3575 pshufd m10, m4, q1032 3576 mova m0, m1 3577 mova m1, m2 3578 mova m2, m3 3579 punpcklwd m3, m4, m10 3580 lea srcq, [srcq+ssq*2] 3581 jmp .w4_loop 3582%else 3583 SWAP m0, m4 3584 mov myd, mym 3585 mov r3, r3m 3586 add myd, dym 3587 test myd, ~0x3ff 3588 jnz .w4_next_line 3589 mova m0, [stk+0x40] 3590 mova m1, [stk+0x50] 3591 mova m2, [stk+0x60] 3592 mova m3, [stk+0x70] 3593 jmp .w4_loop 3594.w4_next_line: 3595 mov r5, [stk+0xc0] 3596 movu m4, [srcq] 3597 movu m5, [r5] 3598 test myd, 0x400 3599 jz .w4_skip_line 3600 add [stk+0xc0], ssq 3601 mova m0, [stk+0x80] 3602 mova m3, [stk+0x50] 3603 mova [stk+0x40], m0 3604 mova [stk+0x80], m3 3605 mova m1, [stk+0x90] 3606 mova m6, [stk+0x60] 3607 mova [stk+0x50], m1 3608 mova 
[stk+0x90], m6 3609 mova m2, [stk+0xa0] 3610 mova m7, [stk+0x70] 3611 mova [stk+0x60], m2 3612 mova [stk+0xa0], m7 3613 pshufb m4, m12 3614 pshufb m5, m14 3615 pmaddwd m4, m13 3616 pmaddwd m5, m15 3617 phaddd m4, m5 3618 paddd m4, hrnd_mem 3619 psrad m4, hsh_mem 3620 packssdw m4, m4 3621 punpcklwd m3, [stk+0xb0], m4 3622 mova [stk+0xb0], m4 3623 mova [stk+0x70], m3 3624 add srcq, ssq 3625 jmp .w4_loop 3626.w4_skip_line: 3627 movu m6, [srcq+ssq*1] 3628 movu m7, [r5 +ssq*1] 3629 lea r5, [r5 +ssq*2] 3630 mov [stk+0xc0], r5 3631 mova m0, [stk+0x50] 3632 mova m1, [stk+0x60] 3633 mova m2, [stk+0x70] 3634 mova m3, [stk+0x90] 3635 pshufb m4, m12 3636 pshufb m6, m12 3637 pshufb m5, m14 3638 pshufb m7, m14 3639 pmaddwd m4, m13 3640 pmaddwd m6, m13 3641 pmaddwd m5, m15 3642 pmaddwd m7, m15 3643 mova [stk+0x40], m0 3644 mova [stk+0x50], m1 3645 mova [stk+0x60], m2 3646 mova [stk+0x80], m3 3647 phaddd m4, m5 3648 phaddd m6, m7 3649 mova m5, [stk+0xa0] 3650 mova m7, [stk+0xb0] 3651 paddd m4, hrnd_mem 3652 paddd m6, hrnd_mem 3653 psrad m4, hsh_mem 3654 psrad m6, hsh_mem 3655 packssdw m4, m6 3656 punpcklwd m7, m4 3657 pshufd m6, m4, q1032 3658 mova [stk+0x90], m5 3659 mova [stk+0xa0], m7 3660 mova [stk+0xb0], m6 3661 punpcklwd m3, m4, m6 3662 mova [stk+0x70], m3 3663 lea srcq, [srcq+ssq*2] 3664 jmp .w4_loop 3665%endif 3666INIT_XMM ssse3 3667%if ARCH_X86_64 3668 %define stk rsp+0x20 3669%endif 3670.w8: 3671 mov dword [stk+0xf0], 1 3672 movifprep tmp_stridem, 16 3673 jmp .w_start 3674.w16: 3675 mov dword [stk+0xf0], 2 3676 movifprep tmp_stridem, 32 3677 jmp .w_start 3678.w32: 3679 mov dword [stk+0xf0], 4 3680 movifprep tmp_stridem, 64 3681 jmp .w_start 3682.w64: 3683 mov dword [stk+0xf0], 8 3684 movifprep tmp_stridem, 128 3685 jmp .w_start 3686.w128: 3687 mov dword [stk+0xf0], 16 3688 movifprep tmp_stridem, 256 3689.w_start: 3690%if ARCH_X86_64 3691 %ifidn %1, put 3692 movifnidn dsm, dsq 3693 %endif 3694 mova [rsp+0x10], m11 3695 %define hround m11 3696 shr t0d, 16 3697 movd m15, 
t0d 3698 %if isprep 3699 mova m13, [base+pd_m524256] 3700 %endif 3701%else 3702 %define hround [esp+0x00] 3703 %define m12 [esp+0x10] 3704 %define m10 [base+pd_0x3ff] 3705 %define m8 m0 3706 %xdefine m14 m4 3707 %define m15 m3 3708 %if isprep 3709 %define ssq ssm 3710 %endif 3711 mov r4, [esp+0x1f0] 3712 shr r4, 16 3713 movd m15, r4 3714 mov r0, r0m 3715 mov myd, mym 3716%endif 3717 sub srcq, 6 3718 pslld m7, m8, 2 ; dx*4 3719 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] 3720 pshufd m15, m15, q0000 3721 paddd m14, m8 ; mx+dx*[0-3] 3722 mova [stk+0x100], m7 3723 mova [stk+0x120], m15 3724 mov [stk+0x0f8], srcq 3725 mov [stk+0x130], r0q ; dstq / tmpq 3726%if ARCH_X86_64 && UNIX64 3727 mov hm, hd 3728%elif ARCH_X86_32 3729 mov r5, hm 3730 mov [stk+0x0f4], myd 3731 mov [stk+0x134], r5 3732%endif 3733 jmp .hloop 3734.hloop_prep: 3735 dec dword [stk+0x0f0] 3736 jz .ret 3737%if ARCH_X86_64 3738 add qword [stk+0x130], 16 3739 mov hd, hm 3740%else 3741 add dword [stk+0x130], 16 3742 mov myd, [stk+0x0f4] 3743 mov r5, [stk+0x134] 3744 mov r0, [stk+0x130] 3745%endif 3746 mova m7, [stk+0x100] 3747 mova m14, [stk+0x110] 3748%if ARCH_X86_64 3749 mova m10, [base+pd_0x3ff] 3750 mova m11, [rsp+0x10] 3751%endif 3752 mova m15, [stk+0x120] 3753 mov srcq, [stk+0x0f8] 3754%if ARCH_X86_64 3755 mov r0q, [stk+0x130] ; dstq / tmpq 3756%else 3757 mov mym, myd 3758 mov hm, r5 3759 mov r0m, r0 3760 mov r3, r3m 3761%endif 3762 paddd m14, m7 3763.hloop: 3764%if ARCH_X86_64 3765 mova m9, [base+pq_0x40000000] 3766%else 3767 %define m9 [base+pq_0x40000000] 3768%endif 3769 pxor m1, m1 3770 psrld m2, m14, 10 3771 mova [stk], m2 3772 pand m6, m14, m10 3773 psrld m6, 6 3774 paddd m5, m15, m6 3775 pcmpeqd m6, m1 3776 pshufd m2, m5, q1032 3777%if ARCH_X86_64 3778 movd r4d, m5 3779 movd r6d, m2 3780 pshufd m5, m5, q0321 3781 pshufd m2, m2, q0321 3782 movd r7d, m5 3783 movd r9d, m2 3784 movq m0, [base+subpel_filters+r4*8] 3785 movq m1, [base+subpel_filters+r6*8] 3786 movhps m0, [base+subpel_filters+r7*8] 3787 
movhps m1, [base+subpel_filters+r9*8] 3788%else 3789 movd r0, m5 3790 movd rX, m2 3791 pshufd m5, m5, q0321 3792 pshufd m2, m2, q0321 3793 movd r4, m5 3794 movd r5, m2 3795 movq m0, [base+subpel_filters+r0*8] 3796 movq m1, [base+subpel_filters+rX*8] 3797 movhps m0, [base+subpel_filters+r4*8] 3798 movhps m1, [base+subpel_filters+r5*8] 3799%endif 3800 paddd m14, m7 ; mx+dx*[4-7] 3801 pand m5, m14, m10 3802 psrld m5, 6 3803 paddd m15, m5 3804 pxor m2, m2 3805 pcmpeqd m5, m2 3806 mova [stk+0x110], m14 3807 pshufd m4, m15, q1032 3808%if ARCH_X86_64 3809 movd r10d, m15 3810 movd r11d, m4 3811 pshufd m15, m15, q0321 3812 pshufd m4, m4, q0321 3813 movd r13d, m15 3814 movd rXd, m4 3815 movq m2, [base+subpel_filters+r10*8] 3816 movq m3, [base+subpel_filters+r11*8] 3817 movhps m2, [base+subpel_filters+r13*8] 3818 movhps m3, [base+subpel_filters+ rX*8] 3819 psrld m14, 10 3820 movq r11, m14 3821 punpckhqdq m14, m14 3822 movq rX, m14 3823 mov r10d, r11d 3824 shr r11, 32 3825 mov r13d, rXd 3826 shr rX, 32 3827 mov r4d, [stk+ 0] 3828 mov r6d, [stk+ 4] 3829 mov r7d, [stk+ 8] 3830 mov r9d, [stk+12] 3831 pshufd m4, m6, q1100 3832 pshufd m6, m6, q3322 3833 pshufd m14, m5, q1100 3834 pshufd m5, m5, q3322 3835 pand m7, m9, m4 3836 pand m8, m9, m6 3837 pand m15, m9, m14 3838 pand m9, m9, m5 3839 pandn m4, m0 3840 pandn m6, m1 3841 pandn m14, m2 3842 pandn m5, m3 3843 por m7, m4 3844 por m8, m6 3845 por m15, m14 3846 por m9, m5 3847 punpcklbw m0, m7, m7 3848 punpckhbw m7, m7 3849 punpcklbw m1, m8, m8 3850 punpckhbw m8, m8 3851 psraw m0, 8 3852 psraw m7, 8 3853 psraw m1, 8 3854 psraw m8, 8 3855 punpcklbw m2, m15, m15 3856 punpckhbw m15, m15 3857 punpcklbw m3, m9, m9 3858 punpckhbw m9, m9 3859 psraw m2, 8 3860 psraw m15, 8 3861 psraw m3, 8 3862 psraw m9, 8 3863 mova [stk+0x10], m0 3864 mova [stk+0x20], m7 3865 mova [stk+0x30], m1 3866 mova [stk+0x40], m8 3867 mova [stk+0x50], m2 3868 mova [stk+0x60], m15 3869 mova [stk+0x70], m3 3870 mova [stk+0x80], m9 3871 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 
6, 9, 10 ; 0 3872 mova [stk+0x90], m1 3873 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 3874 mova [stk+0xa0], m2 3875 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 3876 mova [stk+0xb0], m3 3877 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 3878 mova [stk+0xc0], m4 3879 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 3880 mova [stk+0xd0], m5 3881 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 3882 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 3883 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 3884 mova m5, [stk+0xd0] 3885 mova m1, [stk+0x90] 3886 mova m2, [stk+0xa0] 3887 mova m3, [stk+0xb0] 3888 mova m9, [stk+0xc0] 3889 mov myd, mym 3890 mov dyd, dym 3891 punpcklwd m4, m5, m6 ; 45a 3892 punpckhwd m5, m6 ; 45b 3893 punpcklwd m6, m7, m8 ; 67a 3894 punpckhwd m7, m8 ; 67b 3895 punpcklwd m0, m1, m2 ; 01a 3896 punpckhwd m1, m2 ; 01b 3897 punpcklwd m2, m3, m9 ; 23a 3898 punpckhwd m3, m9 ; 23b 3899 mova [stk+0x90], m4 3900 mova [stk+0xa0], m5 3901 mova [stk+0xb0], m6 3902 mova [stk+0xc0], m7 3903 %define hround [rsp+0x10] 3904.vloop: 3905 and myd, 0x3ff 3906 mov r6d, 64 << 24 3907 mov r4d, myd 3908 shr r4d, 6 3909 lea r4d, [t1+r4] 3910 cmovnz r6q, [base+subpel_filters+r4*8] 3911 movq m11, r6q 3912 punpcklbw m11, m11 3913 psraw m11, 8 3914 pshufd m5, m11, q0000 3915 pshufd m7, m11, q1111 3916 pshufd m10, m11, q2222 3917 pshufd m11, m11, q3333 3918 pmaddwd m4, m5, m0 3919 pmaddwd m5, m5, m1 3920 pmaddwd m6, m7, m2 3921 pmaddwd m7, m7, m3 3922 paddd m4, m13 3923 paddd m5, m13 3924 paddd m4, m6 3925 paddd m5, m7 3926 pmaddwd m6, [stk+0x90], m10 3927 pmaddwd m7, [stk+0xa0], m10 3928 pmaddwd m8, [stk+0xb0], m11 3929 pmaddwd m9, [stk+0xc0], m11 3930 paddd m4, m6 3931 paddd m5, m7 3932 %if isput 3933 pshufd m6, m12, q1032 3934 %endif 3935 paddd m4, m8 3936 paddd m5, m9 3937%else 3938 movd r0, m15 3939 movd rX, m4 3940 pshufd m15, m15, q0321 3941 pshufd m4, m4, q0321 3942 movd r4, m15 3943 movd r5, m4 3944 mova m14, [stk+0x110] 3945 movq m2, [base+subpel_filters+r0*8] 3946 movq m3, 
[base+subpel_filters+rX*8] 3947 movhps m2, [base+subpel_filters+r4*8] 3948 movhps m3, [base+subpel_filters+r5*8] 3949 psrld m14, 10 3950 mova [stk+16], m14 3951 mov r0, [stk+ 0] 3952 mov rX, [stk+ 4] 3953 mov r4, [stk+ 8] 3954 mov r5, [stk+12] 3955 mova [stk+0x20], m0 3956 mova [stk+0x30], m1 3957 mova [stk+0x40], m2 3958 mova [stk+0x50], m3 3959 pshufd m4, m6, q1100 3960 pshufd m6, m6, q3322 3961 pshufd m7, m5, q1100 3962 pshufd m5, m5, q3322 3963 pand m0, m9, m4 3964 pand m1, m9, m6 3965 pand m2, m9, m7 3966 pand m3, m9, m5 3967 pandn m4, [stk+0x20] 3968 pandn m6, [stk+0x30] 3969 pandn m7, [stk+0x40] 3970 pandn m5, [stk+0x50] 3971 por m0, m4 3972 por m1, m6 3973 por m2, m7 3974 por m3, m5 3975 punpcklbw m4, m0, m0 3976 punpckhbw m0, m0 3977 punpcklbw m5, m1, m1 3978 punpckhbw m1, m1 3979 psraw m4, 8 3980 psraw m0, 8 3981 psraw m5, 8 3982 psraw m1, 8 3983 punpcklbw m6, m2, m2 3984 punpckhbw m2, m2 3985 punpcklbw m7, m3, m3 3986 punpckhbw m3, m3 3987 psraw m6, 8 3988 psraw m2, 8 3989 psraw m7, 8 3990 psraw m3, 8 3991 mova [stk+0x0a0], m4 3992 mova [stk+0x0b0], m0 3993 mova [stk+0x0c0], m5 3994 mova [stk+0x0d0], m1 3995 mova [stk+0x140], m6 3996 mova [stk+0x150], m2 3997 mova [stk+0x160], m7 3998 mova [stk+0x170], m3 3999 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 4000 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 4001 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 4002 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 4003 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 4004 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 4005 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 4006 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 4007 mova m5, [stk+0x60] 4008 mova m6, [stk+0x70] 4009 mova m7, [stk+0x80] 4010 mova m0, [stk+0x90] 4011 mov myd, mym 4012 punpcklwd m4, m5, m6 ; 45a 4013 punpckhwd m5, m6 ; 45b 4014 punpcklwd m6, m7, m0 ; 67a 4015 punpckhwd m7, m0 ; 67b 4016 mova [stk+0x60], m4 4017 mova [stk+0x70], m5 4018 mova [stk+0x80], m6 4019 mova [stk+0x90], m7 4020 mova m1, [stk+0x20] 4021 mova m2, [stk+0x30] 4022 mova m3, [stk+0x40] 4023 mova m4, [stk+0x50] 4024 punpcklwd m0, m1, m2 
; 01a 4025 punpckhwd m1, m2 ; 01b 4026 punpcklwd m2, m3, m4 ; 23a 4027 punpckhwd m3, m4 ; 23b 4028 mova [stk+0x20], m0 4029 mova [stk+0x30], m1 4030 mova [stk+0x40], m2 4031 mova [stk+0x50], m3 4032.vloop: 4033 mov r0, r0m 4034 mov r5, [esp+0x1f4] 4035 and myd, 0x3ff 4036 mov mym, myd 4037 xor r3, r3 4038 shr r4, 6 4039 lea r5, [r5+r4] 4040 mov r4, 64 << 24 4041 cmovnz r4, [base+subpel_filters+r5*8+0] 4042 cmovnz r3, [base+subpel_filters+r5*8+4] 4043 movd m7, r4 4044 movd m6, r3 4045 punpckldq m7, m6 4046 punpcklbw m7, m7 4047 psraw m7, 8 4048 pshufd m4, m7, q0000 4049 pshufd m5, m7, q1111 4050 pmaddwd m0, m4 4051 pmaddwd m1, m4 4052 pmaddwd m2, m5 4053 pmaddwd m3, m5 4054 pshufd m6, m7, q2222 4055 pshufd m7, m7, q3333 4056 paddd m0, m2 4057 paddd m1, m3 4058 pmaddwd m2, [stk+0x60], m6 4059 pmaddwd m3, [stk+0x70], m6 4060 pmaddwd m4, [stk+0x80], m7 4061 pmaddwd m5, [stk+0x90], m7 4062 %if isput 4063 movd m6, [esp+0x18] 4064 %endif 4065 paddd m0, m2 4066 paddd m1, m3 4067 paddd m0, vrnd_mem 4068 paddd m1, vrnd_mem 4069 paddd m4, m0 4070 paddd m5, m1 4071%endif 4072%ifidn %1, put 4073 psrad m4, m6 4074 psrad m5, m6 4075 packssdw m4, m5 4076 pxor m7, m7 4077 pmaxsw m4, m7 4078 pminsw m4, pxmaxm 4079 mova [dstq], m4 4080 add dstq, dsm 4081%else 4082 psrad m4, 6 4083 psrad m5, 6 4084 packssdw m4, m5 4085 mova [tmpq], m4 4086 add tmpq, tmp_stridem 4087%endif 4088 dec hd 4089 jz .hloop_prep 4090%if ARCH_X86_64 4091 add myd, dyd 4092 test myd, ~0x3ff 4093 jz .vloop 4094 test myd, 0x400 4095 mov [stk+0x140], myd 4096 mov r4d, [stk+ 0] 4097 mov r6d, [stk+ 4] 4098 mov r7d, [stk+ 8] 4099 mov r9d, [stk+12] 4100 jz .skip_line 4101 mova m14, [base+unpckw] 4102 movu m8, [srcq+r10*2] 4103 movu m9, [srcq+r11*2] 4104 movu m10, [srcq+r13*2] 4105 movu m11, [srcq+ rX*2] 4106 movu m4, [srcq+ r4*2] 4107 movu m5, [srcq+ r6*2] 4108 movu m6, [srcq+ r7*2] 4109 movu m7, [srcq+ r9*2] 4110 add srcq, ssq 4111 mov myd, [stk+0x140] 4112 mov dyd, dym 4113 pshufd m15, m14, q1032 4114 pshufb m0, m14 ; 
0a 1a 4115 pshufb m1, m14 ; 0b 1b 4116 pshufb m2, m15 ; 3a 2a 4117 pshufb m3, m15 ; 3b 2b 4118 pmaddwd m8, [stk+0x50] 4119 pmaddwd m9, [stk+0x60] 4120 pmaddwd m10, [stk+0x70] 4121 pmaddwd m11, [stk+0x80] 4122 pmaddwd m4, [stk+0x10] 4123 pmaddwd m5, [stk+0x20] 4124 pmaddwd m6, [stk+0x30] 4125 pmaddwd m7, [stk+0x40] 4126 phaddd m8, m9 4127 phaddd m10, m11 4128 mova m11, hround 4129 phaddd m4, m5 4130 phaddd m6, m7 4131 phaddd m8, m10 4132 phaddd m4, m6 4133 paddd m4, m11 4134 paddd m8, m11 4135 psrad m4, m12 4136 psrad m8, m12 4137 packssdw m4, m8 4138 pshufb m5, [stk+0x90], m14 ; 4a 5a 4139 pshufb m6, [stk+0xa0], m14 ; 4b 5b 4140 pshufb m7, [stk+0xb0], m15 ; 7a 6a 4141 pshufb m8, [stk+0xc0], m15 ; 7b 6b 4142 punpckhwd m0, m2 ; 12a 4143 punpckhwd m1, m3 ; 12b 4144 punpcklwd m2, m5 ; 34a 4145 punpcklwd m3, m6 ; 34b 4146 punpckhwd m5, m7 ; 56a 4147 punpckhwd m6, m8 ; 56b 4148 punpcklwd m7, m4 ; 78a 4149 punpckhqdq m4, m4 4150 punpcklwd m8, m4 ; 78b 4151 mova [stk+0x90], m5 4152 mova [stk+0xa0], m6 4153 mova [stk+0xb0], m7 4154 mova [stk+0xc0], m8 4155 jmp .vloop 4156.skip_line: 4157 MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11 4158 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11 4159 mov myd, [stk+0x140] 4160 mov dyd, dym 4161 mova m0, m2 ; 01a 4162 mova m1, m3 ; 01b 4163 mova m2, [stk+0x90] ; 23a 4164 mova m3, [stk+0xa0] ; 23b 4165 mova m5, [stk+0xb0] ; 45a 4166 mova m6, [stk+0xc0] ; 45b 4167 punpcklwd m7, m4, m8 ; 67a 4168 punpckhwd m4, m8 ; 67b 4169 mova [stk+0x90], m5 4170 mova [stk+0xa0], m6 4171 mova [stk+0xb0], m7 4172 mova [stk+0xc0], m4 4173%else 4174 mov r0m, r0 4175 mov myd, mym 4176 mov r3, r3m 4177 add myd, dym 4178 test myd, ~0x3ff 4179 mov mym, myd 4180 jnz .next_line 4181 mova m0, [stk+0x20] 4182 mova m1, [stk+0x30] 4183 mova m2, [stk+0x40] 4184 mova m3, [stk+0x50] 4185 jmp .vloop 4186.next_line: 4187 test myd, 0x400 4188 mov r0, [stk+ 0] 4189 mov rX, [stk+ 4] 4190 mov r4, [stk+ 8] 4191 mov r5, [stk+12] 4192 jz .skip_line 4193 MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 
4194 mova m7, [base+unpckw] 4195 pshufd m4, m7, q1032 4196 pshufb m0, [stk+0x20], m7 ; 0a 1a 4197 pshufb m1, [stk+0x30], m7 ; 0b 1b 4198 pshufb m2, [stk+0x40], m4 ; 3a 2a 4199 pshufb m3, [stk+0x50], m4 ; 3b 2b 4200 pshufb m5, [stk+0x60], m7 ; 4a 5a 4201 pshufb m6, [stk+0x70], m7 ; 4b 5b 4202 pshufb m7, [stk+0x80], m4 ; 7a 6a 4203 punpckhwd m0, m2 ; 12a 4204 punpckhwd m1, m3 ; 12b 4205 punpcklwd m2, m5 ; 34a 4206 punpcklwd m3, m6 ; 34b 4207 mova [stk+0x20], m0 4208 mova [stk+0x30], m1 4209 mova [stk+0x40], m2 4210 mova [stk+0x50], m3 4211 punpckhwd m5, m7 ; 56a 4212 mova [stk+0x60], m5 4213 pshufb m5, [stk+0x90], m4 ; 7b 6b 4214 punpcklwd m7, [stk+0xe0] ; 78a 4215 punpckhwd m6, m5 ; 56b 4216 mova [stk+0x70], m6 4217 movq m6, [stk+0xe8] 4218 mova [stk+0x80], m7 4219 punpcklwd m5, m6 4220 mov myd, mym 4221 mova [stk+0x90], m5 4222 jmp .vloop 4223.skip_line: 4224 MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 4225 MC_8TAP_SCALED_H 0xa0, 0 ; 9 4226 mova m7, [stk+0xe0] 4227 mova m2, [stk+0x60] ; 23a 4228 mova m3, [stk+0x70] ; 23b 4229 mova m4, [stk+0x80] ; 45a 4230 mova m5, [stk+0x90] ; 45b 4231 punpcklwd m6, m7, m0 ; 67a 4232 punpckhwd m7, m0 ; 67b 4233 mova m0, [stk+0x40] ; 01a 4234 mova m1, [stk+0x50] ; 01b 4235 mov myd, mym 4236 mova [stk+0x40], m2 4237 mova [stk+0x50], m3 4238 mova [stk+0x60], m4 4239 mova [stk+0x70], m5 4240 mova [stk+0x80], m6 4241 mova [stk+0x90], m7 4242 mova [stk+0x20], m0 4243 mova [stk+0x30], m1 4244%endif 4245 jmp .vloop 4246INIT_XMM ssse3 4247.dy1: 4248 movzx wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2] 4249 add wq, base_reg 4250 jmp wq 4251%if isput 4252.dy1_w2: 4253 %if ARCH_X86_64 4254 mov myd, mym 4255 movzx t0d, t0b 4256 sub srcq, 2 4257 movd m15, t0d 4258 %else 4259 %define m8 m0 4260 %define m9 m1 4261 %define m14 m4 4262 %define m15 m3 4263 %define m11 [esp+0x00] 4264 %define m12 [esp+0x10] 4265 %define m13 [esp+0x20] 4266 movzx r5, byte [esp+0x1f0] 4267 sub srcq, 2 4268 movd m15, r5 4269 mov r1, r1m 4270 %endif 4271 pxor m9, m9 4272 
punpckldq m9, m8 4273 paddd m14, m9 ; mx+dx*[0-1] 4274 %if ARCH_X86_64 4275 mova m9, [base+pd_0x4000] 4276 %endif 4277 pshufd m15, m15, q0000 4278 pand m8, m14, m10 4279 psrld m8, 6 4280 paddd m15, m8 4281 movd r4d, m15 4282 pshufd m15, m15, q0321 4283 %if ARCH_X86_64 4284 movd r6d, m15 4285 %else 4286 movd r3d, m15 4287 %endif 4288 mova m5, [base+bdct_lb_q] 4289 mova m6, [base+spel_s_shuf2] 4290 movd m15, [base+subpel_filters+r4*8+2] 4291 %if ARCH_X86_64 4292 movd m7, [base+subpel_filters+r6*8+2] 4293 %else 4294 movd m7, [base+subpel_filters+r3*8+2] 4295 %endif 4296 pxor m2, m2 4297 pcmpeqd m8, m2 4298 psrld m14, 10 4299 paddd m14, m14 4300 %if ARCH_X86_32 4301 mov r3, r3m 4302 pshufb m14, m5 4303 paddb m14, m6 4304 mova [stk], m14 4305 SWAP m5, m0 4306 SWAP m6, m3 4307 %define m15 m6 4308 %endif 4309 movu m0, [srcq+ssq*0] 4310 movu m1, [srcq+ssq*1] 4311 movu m2, [srcq+ssq*2] 4312 movu m3, [srcq+ss3q ] 4313 lea srcq, [srcq+ssq*4] 4314 punpckldq m15, m7 4315 %if ARCH_X86_64 4316 pshufb m14, m5 4317 paddb m14, m6 4318 pand m9, m8 4319 pandn m8, m15 4320 SWAP m15, m8 4321 por m15, m9 4322 movu m4, [srcq+ssq*0] 4323 movu m5, [srcq+ssq*1] 4324 movu m6, [srcq+ssq*2] 4325 add srcq, ss3q 4326 shr myd, 6 4327 mov r4d, 64 << 24 4328 lea myd, [t1+myq] 4329 cmovnz r4q, [base+subpel_filters+myq*8] 4330 %else 4331 pand m7, m5, [base+pd_0x4000] 4332 pandn m5, m15 4333 por m5, m7 4334 %define m15 m5 4335 mov myd, mym 4336 mov r5, [esp+0x1f4] 4337 xor r3, r3 4338 shr myd, 6 4339 lea r5, [r5+myd] 4340 mov r4, 64 << 24 4341 cmovnz r4, [base+subpel_filters+r5*8+0] 4342 cmovnz r3, [base+subpel_filters+r5*8+4] 4343 mov [stk+0x20], r3 4344 mov r3, r3m 4345 %endif 4346 punpcklbw m15, m15 4347 psraw m15, 8 4348 REPX {pshufb x, m14}, m0, m1, m2, m3 4349 REPX {pmaddwd x, m15}, m0, m1, m2, m3 4350 %if ARCH_X86_64 4351 REPX {pshufb x, m14}, m4, m5, m6 4352 REPX {pmaddwd x, m15}, m4, m5, m6 4353 phaddd m0, m1 4354 phaddd m2, m3 4355 phaddd m4, m5 4356 phaddd m6, m6 4357 REPX {paddd x, m11}, 
m0, m2, m4, m6 4358 REPX {psrad x, m12}, m0, m2, m4, m6 4359 packssdw m0, m2 ; 0 1 2 3 4360 packssdw m4, m6 ; 4 5 6 4361 SWAP m1, m4 4362 movq m10, r4 4363 %else 4364 mova [stk+0x10], m15 4365 phaddd m0, m1 4366 phaddd m2, m3 4367 movu m1, [srcq+ssq*0] 4368 movu m7, [srcq+ssq*1] 4369 movu m6, [srcq+ssq*2] 4370 add srcq, ss3q 4371 REPX {pshufb x, m14}, m1, m7, m6 4372 REPX {pmaddwd x, m15}, m1, m7, m6 4373 %define m14 [stk+0x00] 4374 %define m15 [stk+0x10] 4375 phaddd m1, m7 4376 phaddd m6, m6 4377 REPX {paddd x, m11}, m0, m2, m1, m6 4378 REPX {psrad x, m12}, m0, m2, m1, m6 4379 packssdw m0, m2 4380 packssdw m1, m6 4381 %define m8 m6 4382 %define m9 m4 4383 %define m10 m5 4384 movd m10, r4 4385 movd m9, [stk+0x20] 4386 punpckldq m10, m9 4387 %endif 4388 punpcklbw m10, m10 4389 psraw m10, 8 4390 pshufd m7, m10, q0000 4391 pshufd m8, m10, q1111 4392 pshufd m9, m10, q2222 4393 pshufd m10, m10, q3333 4394 %if ARCH_X86_32 4395 mova [stk+0x50], m7 4396 mova [stk+0x60], m8 4397 mova [stk+0x70], m9 4398 mova [stk+0x80], m10 4399 %define m7 [stk+0x50] 4400 %define m8 [stk+0x60] 4401 %define m9 [stk+0x70] 4402 %define m10 [stk+0x80] 4403 %endif 4404 palignr m2, m1, m0, 4 ; 1 2 3 4 4405 punpcklwd m3, m0, m2 ; 01 12 4406 punpckhwd m0, m2 ; 23 34 4407 pshufd m4, m1, q2121 ; 5 6 5 6 4408 punpcklwd m2, m1, m4 ; 45 56 4409 %if ARCH_X86_32 4410 mov r0, r0m 4411 %endif 4412.dy1_w2_loop: 4413 movu m1, [srcq+ssq*0] 4414 movu m6, [srcq+ssq*1] 4415 lea srcq, [srcq+ssq*2] 4416 pmaddwd m5, m3, m7 4417 mova m3, m0 4418 pmaddwd m0, m8 4419 pshufb m1, m14 4420 pshufb m6, m14 4421 pmaddwd m1, m15 4422 pmaddwd m6, m15 4423 phaddd m1, m6 4424 paddd m1, m11 4425 psrad m1, m12 4426 packssdw m1, m1 4427 paddd m5, m0 4428 mova m0, m2 4429 pmaddwd m2, m9 4430 paddd m5, m2 4431 palignr m2, m1, m4, 12 4432 punpcklwd m2, m1 ; 67 78 4433 pmaddwd m4, m2, m10 4434 paddd m5, m13 4435 paddd m5, m4 4436 pxor m6, m6 4437 mova m4, m1 4438 pshufd m1, m12, q1032 4439 psrad m5, m1 4440 packssdw m5, m5 4441 pmaxsw 
m5, m6 4442 pminsw m5, pxmaxm 4443 movd [dstq+dsq*0], m5 4444 pshuflw m5, m5, q1032 4445 movd [dstq+dsq*1], m5 4446 lea dstq, [dstq+dsq*2] 4447 sub hd, 2 4448 jg .dy1_w2_loop 4449 RET 4450%endif 4451INIT_XMM ssse3 4452.dy1_w4: 4453%if ARCH_X86_64 4454 mov myd, mym 4455 mova [rsp+0x10], m11 4456 mova [rsp+0x20], m12 4457 %if isput 4458 mova [rsp+0x30], m13 4459 %define vrnd_mem [rsp+0x30] 4460 %define stk rsp+0x40 4461 %else 4462 %define vrnd_mem [base+pd_m524256] 4463 %define stk rsp+0x30 4464 %endif 4465 movzx t0d, t0b 4466 sub srcq, 2 4467 movd m15, t0d 4468%else 4469 %define m10 [base+pd_0x3ff] 4470 %define m9 [base+pd_0x4000] 4471 %define m8 m0 4472 %xdefine m14 m4 4473 %define m15 m3 4474 %if isprep 4475 %define ssq r3 4476 %endif 4477 movzx r5, byte [esp+0x1f0] 4478 sub srcq, 2 4479 movd m15, r5 4480%endif 4481 pmaddwd m8, [base+rescale_mul] 4482%if ARCH_X86_64 4483 mova m9, [base+pd_0x4000] 4484%endif 4485 pshufd m15, m15, q0000 4486 paddd m14, m8 ; mx+dx*[0-3] 4487 pand m0, m14, m10 4488 psrld m0, 6 4489 paddd m15, m0 4490 pshufd m7, m15, q1032 4491%if ARCH_X86_64 4492 movd r4d, m15 4493 movd r11d, m7 4494 pshufd m15, m15, q0321 4495 pshufd m7, m7, q0321 4496 movd r6d, m15 4497 movd r13d, m7 4498 mova m10, [base+bdct_lb_q+ 0] 4499 mova m11, [base+bdct_lb_q+16] 4500 movd m13, [base+subpel_filters+ r4*8+2] 4501 movd m2, [base+subpel_filters+ r6*8+2] 4502 movd m15, [base+subpel_filters+r11*8+2] 4503 movd m4, [base+subpel_filters+r13*8+2] 4504%else 4505 movd r0, m15 4506 movd r4, m7 4507 pshufd m15, m15, q0321 4508 pshufd m7, m7, q0321 4509 movd rX, m15 4510 movd r5, m7 4511 mova m5, [base+bdct_lb_q+ 0] 4512 mova m6, [base+bdct_lb_q+16] 4513 movd m1, [base+subpel_filters+r0*8+2] 4514 movd m2, [base+subpel_filters+rX*8+2] 4515 movd m3, [base+subpel_filters+r4*8+2] 4516 movd m7, [base+subpel_filters+r5*8+2] 4517 SWAP m4, m7 4518 %if isprep 4519 mov r3, r3m 4520 %endif 4521 %define m10 m5 4522 %define m11 m6 4523 %define m12 m1 4524 %define m13 m1 4525%endif 4526 
psrld m14, 10 4527 paddd m14, m14 4528 punpckldq m13, m2 4529 punpckldq m15, m4 4530 punpcklqdq m13, m15 4531 pxor m2, m2 4532 pcmpeqd m0, m2 4533%if ARCH_X86_64 4534 pand m9, m0 4535%else 4536 pand m2, m9, m0 4537 %define m9 m2 4538 SWAP m7, m4 4539%endif 4540 pandn m0, m13 4541%if ARCH_X86_64 4542 SWAP m13, m0 4543%else 4544 %define m13 m0 4545%endif 4546 por m13, m9 4547 punpckhbw m15, m13, m13 4548 punpcklbw m13, m13 4549 psraw m15, 8 4550 psraw m13, 8 4551 pshufb m12, m14, m10 4552 pshufb m14, m11 4553 mova m10, [base+spel_s_shuf2] 4554 movd r4d, m14 4555 shr r4d, 24 4556%if ARCH_X86_32 4557 mova [stk+0x40], m13 4558 mova [stk+0x50], m15 4559 pxor m2, m2 4560%endif 4561 pshufb m7, m14, m2 4562 psubb m14, m7 4563 paddb m12, m10 4564 paddb m14, m10 4565%if ARCH_X86_64 4566 lea r6, [r4+ssq*1] 4567 lea r11, [r4+ssq*2] 4568 lea r13, [r4+ss3q ] 4569 movu m7, [srcq+ssq*0] 4570 movu m9, [srcq+ssq*1] 4571 movu m8, [srcq+ssq*2] 4572 movu m10, [srcq+ss3q ] 4573 movu m1, [srcq+r4 ] 4574 movu m3, [srcq+r6 ] 4575 movu m2, [srcq+r11 ] 4576 movu m4, [srcq+r13 ] 4577 lea srcq, [srcq+ssq*4] 4578 REPX {pshufb x, m12}, m7, m9, m8, m10 4579 REPX {pmaddwd x, m13}, m7, m9, m8, m10 4580 REPX {pshufb x, m14}, m1, m3, m2, m4 4581 REPX {pmaddwd x, m15}, m1, m3, m2, m4 4582 mova m5, [rsp+0x10] 4583 movd xm6, [rsp+0x20] 4584 phaddd m7, m1 4585 phaddd m9, m3 4586 phaddd m8, m2 4587 phaddd m10, m4 4588 movu m1, [srcq+ssq*0] 4589 movu m2, [srcq+ssq*1] 4590 movu m3, [srcq+ssq*2] 4591 REPX {paddd x, m5}, m7, m9, m8, m10 4592 REPX {psrad x, xm6}, m7, m9, m8, m10 4593 packssdw m7, m9 ; 0 1 4594 packssdw m8, m10 ; 2 3 4595 movu m0, [srcq+r4 ] 4596 movu m9, [srcq+r6 ] 4597 movu m10, [srcq+r11 ] 4598 add srcq, ss3q 4599 REPX {pshufb x, m12}, m1, m2, m3 4600 REPX {pmaddwd x, m13}, m1, m2, m3 4601 REPX {pshufb x, m14}, m0, m9, m10 4602 REPX {pmaddwd x, m15}, m0, m9, m10 4603 phaddd m1, m0 4604 phaddd m2, m9 4605 phaddd m3, m10 4606 shr myd, 6 4607 mov r13d, 64 << 24 4608 lea myd, [t1+myq] 4609 cmovnz 
r13q, [base+subpel_filters+myq*8] 4610 REPX {paddd x, m5}, m1, m2, m3 4611 REPX {psrad x, xm6}, m1, m2, m3 4612 packssdw m1, m2 ; 4 5 4613 packssdw m3, m3 ; 6 6 4614 SWAP m9, m1 4615 shufps m4, m7, m8, q1032 ; 1 2 4616 shufps m5, m8, m9, q1032 ; 3 4 4617 shufps m6, m9, m3, q1032 ; 5 6 4618 punpcklwd m0, m7, m4 ; 01 4619 punpckhwd m7, m4 ; 12 4620 punpcklwd m1, m8, m5 ; 23 4621 punpckhwd m8, m5 ; 34 4622 punpcklwd m2, m9, m6 ; 45 4623 punpckhwd m9, m6 ; 56 4624 movq m10, r13 4625 mova [stk+0x00], m1 4626 mova [stk+0x10], m8 4627 mova [stk+0x20], m2 4628 mova [stk+0x30], m9 4629 mova [stk+0x40], m3 4630 %define hrnd_mem [rsp+0x10] 4631 %define hsh_mem [rsp+0x20] 4632 %define vsh_mem [rsp+0x28] 4633 %if isput 4634 %define vrnd_mem [rsp+0x30] 4635 %else 4636 %define vrnd_mem [base+pd_m524256] 4637 %endif 4638%else 4639 mova [stk+0x20], m12 4640 mova [stk+0x30], m14 4641 add r4, srcq 4642 MC_4TAP_SCALED_H 0x60 ; 0 1 4643 MC_4TAP_SCALED_H 0x70 ; 2 3 4644 MC_4TAP_SCALED_H 0x80 ; 4 5 4645 movu m7, [srcq] 4646 movu m2, [r4] 4647 add srcq, ssq 4648 add r4, ssq 4649 mov [stk+0xb0], r4 4650 pshufb m7, m12 4651 pshufb m2, m14 4652 pmaddwd m7, m13 4653 pmaddwd m2, m15 4654 phaddd m7, m2 4655 paddd m7, [esp+0x00] 4656 psrad m7, [esp+0x10] 4657 packssdw m7, m7 ; 6 6 4658 mova m4, [stk+0x60] 4659 mova m5, [stk+0x70] 4660 mova m6, [stk+0x80] 4661 mov myd, mym 4662 mov rX, [esp+0x1f4] 4663 xor r5, r5 4664 shr myd, 6 4665 lea rX, [rX+myd] 4666 mov r4, 64 << 24 4667 cmovnz r4, [base+subpel_filters+rX*8+0] 4668 cmovnz r5, [base+subpel_filters+rX*8+4] 4669 mov r3, r3m 4670 shufps m1, m4, m5, q1032 ; 1 2 4671 shufps m2, m5, m6, q1032 ; 3 4 4672 shufps m3, m6, m7, q1032 ; 5 6 4673 mova [stk+0xa0], m7 4674 punpcklwd m0, m4, m1 ; 01 4675 punpckhwd m4, m1 ; 12 4676 punpcklwd m1, m5, m2 ; 23 4677 punpckhwd m5, m2 ; 34 4678 punpcklwd m2, m6, m3 ; 45 4679 punpckhwd m6, m3 ; 56 4680 movd m7, r4 4681 movd m3, r5 4682 mov r0, r0m 4683 %if isput 4684 mov r1, r1m 4685 %endif 4686 mov r4, [stk+0xb0] 
4687 mova [stk+0xc0], m4 ; 12 4688 mova [stk+0x60], m1 ; 23 4689 mova [stk+0x70], m2 ; 45 4690 mova [stk+0x80], m5 ; 34 4691 mova [stk+0x90], m6 ; 56 4692 %define m12 [stk+0x20] 4693 %define m14 [stk+0x30] 4694 %define m13 [stk+0x40] 4695 %define m15 [stk+0x50] 4696 %define hrnd_mem [esp+0x00] 4697 %define hsh_mem [esp+0x10] 4698 %define vsh_mem [esp+0x18] 4699 %if isput 4700 %define vrnd_mem [esp+0x20] 4701 %else 4702 %define vrnd_mem [base+pd_m524256] 4703 %endif 4704 %define m10 m7 4705 punpckldq m10, m3 4706%endif 4707 punpcklbw m10, m10 4708 psraw m10, 8 4709 pshufd m3, m10, q0000 4710 pshufd m4, m10, q1111 4711 pshufd m5, m10, q2222 4712 pshufd m10, m10, q3333 4713%if ARCH_X86_32 4714 %xdefine m8 m3 4715 %xdefine m9 m6 4716 %xdefine m11 m5 4717 %xdefine m6 m4 4718 mova [stk+0x100], m3 4719 mova [stk+0x110], m4 4720 mova [stk+0x120], m5 4721 mova [stk+0x130], m10 4722 %define m3 [stk+0x100] 4723 %define m4 [stk+0x110] 4724 %define m5 [stk+0x120] 4725 %define m10 [stk+0x130] 4726 mova m7, [stk+0xc0] 4727 mova m8, [stk+0x80] 4728%endif 4729.dy1_w4_loop: 4730 movu m11, [srcq+ssq*0] 4731 movu m6, [srcq+ssq*1] 4732 pmaddwd m0, m3 4733 pmaddwd m7, m3 4734 pmaddwd m1, m4 4735 pmaddwd m8, m4 4736 pmaddwd m2, m5 4737 pmaddwd m9, m5 4738 paddd m1, m0 4739 paddd m8, m7 4740%if ARCH_X86_64 4741 movu m0, [srcq+r4] 4742 movu m7, [srcq+r6] 4743%else 4744 movu m0, [r4+ssq*0] 4745 movu m7, [r4+ssq*1] 4746 lea r4, [r4+ssq*2] 4747%endif 4748 lea srcq, [srcq+ssq*2] 4749 paddd m1, m2 4750 paddd m8, m9 4751 pshufb m11, m12 4752 pshufb m6, m12 4753 pmaddwd m11, m13 4754 pmaddwd m6, m13 4755 pshufb m0, m14 4756 pshufb m7, m14 4757 pmaddwd m0, m15 4758 pmaddwd m7, m15 4759 phaddd m11, m0 4760 phaddd m6, m7 4761 paddd m11, hrnd_mem 4762 paddd m6, hrnd_mem 4763 psrad m11, hsh_mem 4764 psrad m6, hsh_mem 4765 packssdw m11, m6 ; 7 8 4766%if ARCH_X86_64 4767 shufps m9, [stk+0x40], m11, q1032 ; 6 7 4768 mova m0, [stk+0x00] 4769 mova [stk+0x40], m11 4770%else 4771 shufps m9, [stk+0xa0], m11, 
q1032 ; 6 7 4772 mova m0, [stk+0x60] 4773 mova [stk+0xa0], m11 4774%endif 4775 punpcklwd m2, m9, m11 ; 67 4776 punpckhwd m9, m11 ; 78 4777 pmaddwd m6, m2, m10 4778 pmaddwd m7, m9, m10 4779%if isput 4780 movd m11, vsh_mem 4781%endif 4782 paddd m1, vrnd_mem 4783 paddd m8, vrnd_mem 4784 paddd m1, m6 4785 paddd m8, m7 4786%if ARCH_X86_64 4787 mova m7, [stk+0x10] 4788%else 4789 mova m7, [stk+0x80] 4790%endif 4791%if isput 4792 psrad m1, m11 4793 psrad m8, m11 4794%else 4795 psrad m1, 6 4796 psrad m8, 6 4797%endif 4798 packssdw m1, m8 4799%if ARCH_X86_64 4800 mova m8, [stk+0x30] 4801%else 4802 mova m8, [stk+0x90] 4803%endif 4804%if isput 4805 pxor m6, m6 4806 pmaxsw m1, m6 4807 pminsw m1, pxmaxm 4808 movq [dstq+dsq*0], m1 4809 movhps [dstq+dsq*1], m1 4810 lea dstq, [dstq+dsq*2] 4811%else 4812 mova [tmpq], m1 4813 add tmpq, 16 4814%endif 4815%if ARCH_X86_64 4816 mova m1, [stk+0x20] 4817 mova [stk+0x10], m8 4818 mova [stk+0x00], m1 4819 mova [stk+0x20], m2 4820 mova [stk+0x30], m9 4821%else 4822 mova m1, [stk+0x70] 4823 mova [stk+0x80], m8 4824 mova [stk+0x60], m1 4825 mova [stk+0x70], m2 4826 mova [stk+0x90], m9 4827%endif 4828 sub hd, 2 4829 jg .dy1_w4_loop 4830 MC_8TAP_SCALED_RET ; why not jz .ret? 
4831INIT_XMM ssse3 4832.dy1_w8: 4833 mov dword [stk+0xf0], 1 4834 movifprep tmp_stridem, 16 4835 jmp .dy1_w_start 4836.dy1_w16: 4837 mov dword [stk+0xf0], 2 4838 movifprep tmp_stridem, 32 4839 jmp .dy1_w_start 4840.dy1_w32: 4841 mov dword [stk+0xf0], 4 4842 movifprep tmp_stridem, 64 4843 jmp .dy1_w_start 4844.dy1_w64: 4845 mov dword [stk+0xf0], 8 4846 movifprep tmp_stridem, 128 4847 jmp .dy1_w_start 4848.dy1_w128: 4849 mov dword [stk+0xf0], 16 4850 movifprep tmp_stridem, 256 4851.dy1_w_start: 4852 mov myd, mym 4853%if ARCH_X86_64 4854 %ifidn %1, put 4855 movifnidn dsm, dsq 4856 %endif 4857 mova [rsp+0x10], m11 4858 mova [rsp+0x20], m12 4859 %define hround m11 4860 %if isput 4861 mova [rsp+0x30], m13 4862 %else 4863 mova m13, [base+pd_m524256] 4864 %endif 4865 shr t0d, 16 4866 shr myd, 6 4867 mov r4d, 64 << 24 4868 lea myd, [t1+myq] 4869 cmovnz r4q, [base+subpel_filters+myq*8] 4870 movd m15, t0d 4871%else 4872 %define hround [esp+0x00] 4873 %define m12 [esp+0x10] 4874 %define m10 [base+pd_0x3ff] 4875 %define m8 m0 4876 %xdefine m14 m4 4877 %xdefine m15 m3 4878 %if isprep 4879 %define ssq ssm 4880 %endif 4881 mov r5, [esp+0x1f0] 4882 mov r3, [esp+0x1f4] 4883 shr r5, 16 4884 movd m15, r5 4885 xor r5, r5 4886 shr myd, 6 4887 lea r3, [r3+myd] 4888 mov r4, 64 << 24 4889 cmovnz r4, [base+subpel_filters+r3*8+0] 4890 cmovnz r5, [base+subpel_filters+r3*8+4] 4891 mov r0, r0m 4892 mov r3, r3m 4893%endif 4894 sub srcq, 6 4895 pslld m7, m8, 2 ; dx*4 4896 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] 4897 pshufd m15, m15, q0000 4898 paddd m14, m8 ; mx+dx*[0-3] 4899%if ARCH_X86_64 4900 movq m3, r4q 4901%else 4902 movd m5, r4 4903 movd m6, r5 4904 punpckldq m5, m6 4905 SWAP m3, m5 4906%endif 4907 punpcklbw m3, m3 4908 psraw m3, 8 4909 mova [stk+0x100], m7 4910 mova [stk+0x120], m15 4911 mov [stk+0x0f8], srcq 4912 mov [stk+0x130], r0q ; dstq / tmpq 4913 pshufd m0, m3, q0000 4914 pshufd m1, m3, q1111 4915 pshufd m2, m3, q2222 4916 pshufd m3, m3, q3333 4917%if ARCH_X86_64 4918 mova 
[stk+0x140], m0 4919 mova [stk+0x150], m1 4920 mova [stk+0x160], m2 4921 mova [stk+0x170], m3 4922 %if UNIX64 4923 mov hm, hd 4924 %endif 4925%else 4926 mova [stk+0x180], m0 4927 mova [stk+0x190], m1 4928 mova [stk+0x1a0], m2 4929 mova [stk+0x1b0], m3 4930 SWAP m5, m3 4931 mov r5, hm 4932 mov [stk+0x134], r5 4933%endif 4934 jmp .dy1_hloop 4935.dy1_hloop_prep: 4936 dec dword [stk+0x0f0] 4937 jz .ret 4938%if ARCH_X86_64 4939 add qword [stk+0x130], 16 4940 mov hd, hm 4941%else 4942 add dword [stk+0x130], 16 4943 mov r5, [stk+0x134] 4944 mov r0, [stk+0x130] 4945%endif 4946 mova m7, [stk+0x100] 4947 mova m14, [stk+0x110] 4948%if ARCH_X86_64 4949 mova m10, [base+pd_0x3ff] 4950 mova m11, [rsp+0x10] 4951%endif 4952 mova m15, [stk+0x120] 4953 mov srcq, [stk+0x0f8] 4954%if ARCH_X86_64 4955 mov r0q, [stk+0x130] ; dstq / tmpq 4956%else 4957 mov hm, r5 4958 mov r0m, r0 4959 mov r3, r3m 4960%endif 4961 paddd m14, m7 4962.dy1_hloop: 4963%if ARCH_X86_64 4964 mova m9, [base+pq_0x40000000] 4965%else 4966 %define m9 [base+pq_0x40000000] 4967%endif 4968 pxor m1, m1 4969 psrld m2, m14, 10 4970 mova [stk], m2 4971 pand m6, m14, m10 4972 psrld m6, 6 4973 paddd m5, m15, m6 4974 pcmpeqd m6, m1 4975 pshufd m2, m5, q1032 4976%if ARCH_X86_64 4977 movd r4d, m5 4978 movd r6d, m2 4979 pshufd m5, m5, q0321 4980 pshufd m2, m2, q0321 4981 movd r7d, m5 4982 movd r9d, m2 4983 movq m0, [base+subpel_filters+r4*8] 4984 movq m1, [base+subpel_filters+r6*8] 4985 movhps m0, [base+subpel_filters+r7*8] 4986 movhps m1, [base+subpel_filters+r9*8] 4987%else 4988 movd r0, m5 4989 movd rX, m2 4990 pshufd m5, m5, q0321 4991 pshufd m2, m2, q0321 4992 movd r4, m5 4993 movd r5, m2 4994 movq m0, [base+subpel_filters+r0*8] 4995 movq m1, [base+subpel_filters+rX*8] 4996 movhps m0, [base+subpel_filters+r4*8] 4997 movhps m1, [base+subpel_filters+r5*8] 4998%endif 4999 paddd m14, m7 ; mx+dx*[4-7] 5000 pand m5, m14, m10 5001 psrld m5, 6 5002 paddd m15, m5 5003 pxor m2, m2 5004 pcmpeqd m5, m2 5005 mova [stk+0x110], m14 5006 
pshufd m4, m15, q1032 5007%if ARCH_X86_64 5008 movd r10d, m15 5009 movd r11d, m4 5010 pshufd m15, m15, q0321 5011 pshufd m4, m4, q0321 5012 movd r13d, m15 5013 movd rXd, m4 5014 movq m2, [base+subpel_filters+r10*8] 5015 movq m3, [base+subpel_filters+r11*8] 5016 movhps m2, [base+subpel_filters+r13*8] 5017 movhps m3, [base+subpel_filters+ rX*8] 5018 psrld m14, 10 5019 movq r11, m14 5020 punpckhqdq m14, m14 5021 movq rX, m14 5022 mov r10d, r11d 5023 shr r11, 32 5024 mov r13d, rXd 5025 shr rX, 32 5026 mov r4d, [stk+ 0] 5027 mov r6d, [stk+ 4] 5028 mov r7d, [stk+ 8] 5029 mov r9d, [stk+12] 5030 pshufd m4, m6, q1100 5031 pshufd m6, m6, q3322 5032 pshufd m14, m5, q1100 5033 pshufd m5, m5, q3322 5034 pand m7, m9, m4 5035 pand m8, m9, m6 5036 pand m15, m9, m14 5037 pand m9, m9, m5 5038 pandn m4, m0 5039 pandn m6, m1 5040 pandn m14, m2 5041 pandn m5, m3 5042 por m7, m4 5043 por m8, m6 5044 por m15, m14 5045 por m9, m5 5046 punpcklbw m0, m7, m7 5047 punpckhbw m7, m7 5048 punpcklbw m1, m8, m8 5049 punpckhbw m8, m8 5050 psraw m0, 8 5051 psraw m7, 8 5052 psraw m1, 8 5053 psraw m8, 8 5054 punpcklbw m2, m15, m15 5055 punpckhbw m15, m15 5056 punpcklbw m3, m9, m9 5057 punpckhbw m9, m9 5058 psraw m2, 8 5059 psraw m15, 8 5060 psraw m3, 8 5061 psraw m9, 8 5062 mova [stk+0x10], m0 5063 mova [stk+0x20], m7 5064 mova [stk+0x30], m1 5065 mova [stk+0x40], m8 5066 mova [stk+0x50], m2 5067 mova [stk+0x60], m15 5068 mova [stk+0x70], m3 5069 mova [stk+0x80], m9 5070 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 5071 mova [stk+0x90], m1 5072 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 5073 mova [stk+0xa0], m2 5074 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 5075 mova [stk+0xb0], m3 5076 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 5077 mova [stk+0xc0], m4 5078 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 5079 mova [stk+0xd0], m5 5080 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 5081 MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6 5082 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 5083 mova m5, 
[stk+0xd0] 5084 mova m1, [stk+0x90] 5085 mova m2, [stk+0xa0] 5086 mova m3, [stk+0xb0] 5087 mova m9, [stk+0xc0] 5088 punpcklwd m4, m5, m6 ; 45a 5089 punpckhwd m5, m6 ; 45b 5090 punpcklwd m6, m7, m8 ; 67a 5091 punpckhwd m7, m8 ; 67b 5092 punpcklwd m0, m1, m2 ; 01a 5093 punpckhwd m1, m2 ; 01b 5094 punpcklwd m2, m3, m9 ; 23a 5095 punpckhwd m3, m9 ; 23b 5096 mova m10, [stk+0x140] 5097 mova m11, [stk+0x150] 5098 mova m14, [stk+0x160] 5099 mova m15, [stk+0x170] 5100 mova [stk+0x90], m4 5101 mova [stk+0xa0], m5 5102 mova [stk+0xb0], m6 5103 mova [stk+0xc0], m7 5104 %define hround [rsp+0x10] 5105 %define shift [rsp+0x20] 5106 %if isput 5107 %define vround [rsp+0x30] 5108 %else 5109 %define vround [base+pd_m524256] 5110 %endif 5111.dy1_vloop: 5112 pmaddwd m4, m0, m10 5113 pmaddwd m5, m1, m10 5114 pmaddwd m6, m2, m11 5115 pmaddwd m7, m3, m11 5116 paddd m4, m13 5117 paddd m5, m13 5118 paddd m4, m6 5119 paddd m5, m7 5120 pmaddwd m6, [stk+0x90], m14 5121 pmaddwd m7, [stk+0xa0], m14 5122 pmaddwd m8, [stk+0xb0], m15 5123 pmaddwd m9, [stk+0xc0], m15 5124 paddd m4, m6 5125 paddd m5, m7 5126 %if isput 5127 pshufd m6, m12, q1032 5128 %endif 5129 paddd m4, m8 5130 paddd m5, m9 5131%else 5132 movd r0, m15 5133 movd rX, m4 5134 pshufd m15, m15, q0321 5135 pshufd m4, m4, q0321 5136 movd r4, m15 5137 movd r5, m4 5138 mova m14, [stk+0x110] 5139 movq m2, [base+subpel_filters+r0*8] 5140 movq m3, [base+subpel_filters+rX*8] 5141 movhps m2, [base+subpel_filters+r4*8] 5142 movhps m3, [base+subpel_filters+r5*8] 5143 psrld m14, 10 5144 mova [stk+16], m14 5145 mov r0, [stk+ 0] 5146 mov rX, [stk+ 4] 5147 mov r4, [stk+ 8] 5148 mov r5, [stk+12] 5149 mova [stk+0x20], m0 5150 mova [stk+0x30], m1 5151 mova [stk+0x40], m2 5152 mova [stk+0x50], m3 5153 pshufd m4, m6, q1100 5154 pshufd m6, m6, q3322 5155 pshufd m7, m5, q1100 5156 pshufd m5, m5, q3322 5157 pand m0, m9, m4 5158 pand m1, m9, m6 5159 pand m2, m9, m7 5160 pand m3, m9, m5 5161 pandn m4, [stk+0x20] 5162 pandn m6, [stk+0x30] 5163 pandn m7, 
[stk+0x40] 5164 pandn m5, [stk+0x50] 5165 por m0, m4 5166 por m1, m6 5167 por m2, m7 5168 por m3, m5 5169 punpcklbw m4, m0, m0 5170 punpckhbw m0, m0 5171 punpcklbw m5, m1, m1 5172 punpckhbw m1, m1 5173 psraw m4, 8 5174 psraw m0, 8 5175 psraw m5, 8 5176 psraw m1, 8 5177 punpcklbw m6, m2, m2 5178 punpckhbw m2, m2 5179 punpcklbw m7, m3, m3 5180 punpckhbw m3, m3 5181 psraw m6, 8 5182 psraw m2, 8 5183 psraw m7, 8 5184 psraw m3, 8 5185 mova [stk+0x0a0], m4 5186 mova [stk+0x0b0], m0 5187 mova [stk+0x0c0], m5 5188 mova [stk+0x0d0], m1 5189 mova [stk+0x140], m6 5190 mova [stk+0x150], m2 5191 mova [stk+0x160], m7 5192 mova [stk+0x170], m3 5193 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 5194 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 5195 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 5196 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 5197 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 5198 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 5199 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 5200 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 5201 mova m5, [stk+0x60] 5202 mova m6, [stk+0x70] 5203 mova m7, [stk+0x80] 5204 mova m0, [stk+0x90] 5205 mov r0, r0m 5206 punpcklwd m4, m5, m6 ; 45a 5207 punpckhwd m5, m6 ; 45b 5208 punpcklwd m6, m7, m0 ; 67a 5209 punpckhwd m7, m0 ; 67b 5210 mova [stk+0x60], m4 5211 mova [stk+0x70], m5 5212 mova [stk+0x80], m6 5213 mova [stk+0x90], m7 5214 mova m1, [stk+0x20] 5215 mova m2, [stk+0x30] 5216 mova m3, [stk+0x40] 5217 mova m4, [stk+0x50] 5218 punpcklwd m0, m1, m2 ; 01a 5219 punpckhwd m1, m2 ; 01b 5220 punpcklwd m2, m3, m4 ; 23a 5221 punpckhwd m3, m4 ; 23b 5222 mova m4, [stk+0x180] 5223 mova m5, [stk+0x190] 5224 mova m6, [stk+0x1a0] 5225 mova m7, [stk+0x1b0] 5226 mova [stk+0x20], m0 5227 mova [stk+0x30], m1 5228 mova [stk+0x40], m2 5229 mova [stk+0x50], m3 5230.dy1_vloop: 5231 pmaddwd m0, m4 5232 pmaddwd m1, m4 5233 pmaddwd m2, m5 5234 pmaddwd m3, m5 5235 paddd m0, m2 5236 paddd m1, m3 5237 pmaddwd m2, [stk+0x60], m6 5238 pmaddwd m3, [stk+0x70], m6 5239 pmaddwd m4, [stk+0x80], m7 5240 pmaddwd m5, [stk+0x90], m7 5241 %if isput 5242 movd m6, [esp+0x18] 
5243 %endif 5244 paddd m0, m2 5245 paddd m1, m3 5246 paddd m0, vrnd_mem 5247 paddd m1, vrnd_mem 5248 paddd m4, m0 5249 paddd m5, m1 5250%endif 5251%ifidn %1, put 5252 psrad m4, m6 5253 psrad m5, m6 5254 packssdw m4, m5 5255 pxor m7, m7 5256 pmaxsw m4, m7 5257 pminsw m4, pxmaxm 5258 mova [dstq], m4 5259 add dstq, dsm 5260%else 5261 psrad m4, 6 5262 psrad m5, 6 5263 packssdw m4, m5 5264 mova [tmpq], m4 5265 add tmpq, tmp_stridem 5266%endif 5267 dec hd 5268 jz .dy1_hloop_prep 5269%if ARCH_X86_64 5270 movu m8, [srcq+r10*2] 5271 movu m9, [srcq+r11*2] 5272 movu m12, [srcq+r13*2] 5273 movu m13, [srcq+ rX*2] 5274 movu m4, [srcq+ r4*2] 5275 movu m5, [srcq+ r6*2] 5276 movu m6, [srcq+ r7*2] 5277 movu m7, [srcq+ r9*2] 5278 add srcq, ssq 5279 pmaddwd m8, [stk+0x50] 5280 pmaddwd m9, [stk+0x60] 5281 pmaddwd m12, [stk+0x70] 5282 pmaddwd m13, [stk+0x80] 5283 pmaddwd m4, [stk+0x10] 5284 pmaddwd m5, [stk+0x20] 5285 pmaddwd m6, [stk+0x30] 5286 pmaddwd m7, [stk+0x40] 5287 phaddd m8, m9 5288 phaddd m12, m13 5289 mova m9, [base+unpckw] 5290 mova m13, hround 5291 phaddd m4, m5 5292 phaddd m6, m7 5293 phaddd m8, m12 5294 phaddd m4, m6 5295 pshufd m5, m9, q1032 5296 pshufb m0, m9 ; 0a 1a 5297 pshufb m1, m9 ; 0b 1b 5298 pshufb m2, m5 ; 3a 2a 5299 pshufb m3, m5 ; 3b 2b 5300 mova m12, shift 5301 paddd m4, m13 5302 paddd m8, m13 5303 psrad m4, m12 5304 psrad m8, m12 5305 packssdw m4, m8 5306 pshufb m6, [stk+0x90], m9 ; 4a 5a 5307 pshufb m7, [stk+0xa0], m9 ; 4b 5b 5308 pshufb m8, [stk+0xb0], m5 ; 7a 6a 5309 pshufb m13, [stk+0xc0], m5 ; 7b 6b 5310 punpckhwd m0, m2 ; 12a 5311 punpckhwd m1, m3 ; 12b 5312 punpcklwd m2, m6 ; 34a 5313 punpcklwd m3, m7 ; 34b 5314 punpckhwd m6, m8 ; 56a 5315 punpckhwd m7, m13 ; 56b 5316 punpcklwd m8, m4 ; 78a 5317 punpckhqdq m4, m4 5318 punpcklwd m13, m4 ; 78b 5319 mova [stk+0x90], m6 5320 mova [stk+0xa0], m7 5321 mova [stk+0xb0], m8 5322 mova [stk+0xc0], m13 5323 mova m13, vround 5324%else 5325 mov r0m, r0 5326 mov r3, r3m 5327 mov r0, [stk+ 0] 5328 mov rX, [stk+ 4] 
5329 mov r4, [stk+ 8] 5330 mov r5, [stk+12] 5331 MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8 5332 mova m7, [base+unpckw] 5333 pshufd m4, m7, q1032 5334 pshufb m0, [stk+0x20], m7 ; 0a 1a 5335 pshufb m1, [stk+0x30], m7 ; 0b 1b 5336 pshufb m2, [stk+0x40], m4 ; 3a 2a 5337 pshufb m3, [stk+0x50], m4 ; 3b 2b 5338 pshufb m5, [stk+0x60], m7 ; 4a 5a 5339 pshufb m6, [stk+0x70], m7 ; 4b 5b 5340 pshufb m7, [stk+0x80], m4 ; 7a 6a 5341 punpckhwd m0, m2 ; 12a 5342 punpckhwd m1, m3 ; 12b 5343 punpcklwd m2, m5 ; 34a 5344 punpcklwd m3, m6 ; 34b 5345 mova [stk+0x20], m0 5346 mova [stk+0x30], m1 5347 mova [stk+0x40], m2 5348 mova [stk+0x50], m3 5349 punpckhwd m5, m7 ; 56a 5350 mova [stk+0x60], m5 5351 pshufb m5, [stk+0x90], m4 ; 7b 6b 5352 punpcklwd m7, [stk+0xe0] ; 78a 5353 mova m4, [stk+0x180] 5354 punpckhwd m6, m5 ; 56b 5355 mova [stk+0x70], m6 5356 movq m6, [stk+0xe8] 5357 mova [stk+0x80], m7 5358 mova m7, [stk+0x1b0] 5359 punpcklwd m5, m6 5360 mova m6, [stk+0x1a0] 5361 mova [stk+0x90], m5 5362 mova m5, [stk+0x190] 5363 mov r0, r0m 5364%endif 5365 jmp .dy1_vloop 5366INIT_XMM ssse3 5367%if ARCH_X86_64 5368 %define stk rsp+0x20 5369%endif 5370.dy2: 5371 movzx wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2] 5372 add wq, base_reg 5373 jmp wq 5374%if isput 5375.dy2_w2: 5376 %if ARCH_X86_64 5377 mov myd, mym 5378 mova [rsp+0x10], m13 5379 %define vrnd_mem [rsp+0x10] 5380 movzx t0d, t0b 5381 sub srcq, 2 5382 movd m15, t0d 5383 %else 5384 %define m8 m0 5385 %define m9 m1 5386 %define m14 m4 5387 %define m15 m3 5388 %define m11 [esp+0x00] 5389 %define m12 [esp+0x10] 5390 %define vrnd_mem [esp+0x20] 5391 mov r1, r1m 5392 movzx r5, byte [esp+0x1f0] 5393 sub srcq, 2 5394 movd m15, r5 5395 %endif 5396 pxor m9, m9 5397 punpckldq m9, m8 5398 paddd m14, m9 ; mx+dx*[0-1] 5399 %if ARCH_X86_64 5400 mova m9, [base+pd_0x4000] 5401 %endif 5402 pshufd m15, m15, q0000 5403 pand m8, m14, m10 5404 psrld m8, 6 5405 paddd m15, m8 5406 movd r4d, m15 5407 pshufd m15, m15, q0321 5408 %if ARCH_X86_64 5409 movd r6d, 
m15 5410 %else 5411 movd r3d, m15 5412 %endif 5413 mova m5, [base+bdct_lb_q] 5414 mova m6, [base+spel_s_shuf2] 5415 movd m15, [base+subpel_filters+r4*8+2] 5416 %if ARCH_X86_64 5417 movd m7, [base+subpel_filters+r6*8+2] 5418 %else 5419 movd m7, [base+subpel_filters+r3*8+2] 5420 %endif 5421 pxor m2, m2 5422 pcmpeqd m8, m2 5423 psrld m14, 10 5424 paddd m14, m14 5425 %if ARCH_X86_32 5426 mov r3, r3m 5427 pshufb m14, m5 5428 paddb m14, m6 5429 mova [stk], m14 5430 SWAP m5, m0 5431 SWAP m6, m3 5432 %define m15 m6 5433 %endif 5434 movu m0, [srcq+ssq*0] 5435 movu m1, [srcq+ssq*2] 5436 movu m2, [srcq+ssq*4] 5437 punpckldq m15, m7 5438 %if ARCH_X86_64 5439 pshufb m14, m5 5440 paddb m14, m6 5441 pand m9, m8 5442 pandn m8, m15 5443 SWAP m15, m8 5444 por m15, m9 5445 movu m4, [srcq+ssq*1] 5446 movu m5, [srcq+ss3q ] 5447 lea srcq, [srcq+ssq*4] 5448 movu m6, [srcq+ssq*1] 5449 lea srcq, [srcq+ssq*2] 5450 shr myd, 6 5451 mov r4d, 64 << 24 5452 lea myd, [t1+myq] 5453 cmovnz r4q, [base+subpel_filters+myq*8] 5454 %else 5455 pand m7, m5, [base+pd_0x4000] 5456 pandn m5, m15 5457 por m5, m7 5458 %define m15 m5 5459 mov myd, mym 5460 mov r5, [esp+0x1f4] 5461 xor r3, r3 5462 shr myd, 6 5463 lea r5, [r5+myd] 5464 mov r4, 64 << 24 5465 cmovnz r4, [base+subpel_filters+r5*8+0] 5466 cmovnz r3, [base+subpel_filters+r5*8+4] 5467 mov [stk+0x20], r3 5468 mov r3, r3m 5469 %endif 5470 punpcklbw m15, m15 5471 psraw m15, 8 5472 REPX {pshufb x, m14}, m0, m1, m2 5473 REPX {pmaddwd x, m15}, m0, m1, m2 5474 %if ARCH_X86_64 5475 REPX {pshufb x, m14}, m4, m5, m6 5476 REPX {pmaddwd x, m15}, m4, m5, m6 5477 phaddd m0, m1 5478 phaddd m1, m2 5479 phaddd m4, m5 5480 phaddd m5, m6 5481 REPX {paddd x, m11}, m0, m1, m4, m5 5482 REPX {psrad x, m12}, m0, m1, m4, m5 5483 packssdw m0, m1 ; 0 2 2 4 5484 packssdw m4, m5 ; 1 3 3 5 5485 SWAP m2, m4 5486 movq m10, r4 5487 %else 5488 mova [stk+0x10], m15 5489 phaddd m0, m1 5490 phaddd m1, m2 5491 movu m2, [srcq+ssq*1] 5492 movu m7, [srcq+ss3q ] 5493 lea srcq, [srcq+ssq*4] 
5494 movu m6, [srcq+ssq*1] 5495 lea srcq, [srcq+ssq*2] 5496 REPX {pshufb x, m14}, m2, m7, m6 5497 REPX {pmaddwd x, m15}, m2, m7, m6 5498 %define m14 [stk+0x00] 5499 %define m15 [stk+0x10] 5500 phaddd m2, m7 5501 phaddd m7, m6 5502 REPX {paddd x, m11}, m0, m1, m2, m7 5503 REPX {psrad x, m12}, m0, m1, m2, m7 5504 packssdw m0, m1 5505 packssdw m2, m7 5506 %define m8 m6 5507 %define m9 m4 5508 %define m10 m5 5509 movd m10, r4 5510 movd m9, [stk+0x20] 5511 punpckldq m10, m9 5512 %endif 5513 punpcklbw m10, m10 5514 psraw m10, 8 5515 pshufd m7, m10, q0000 5516 pshufd m8, m10, q1111 5517 pshufd m9, m10, q2222 5518 pshufd m10, m10, q3333 5519 %if ARCH_X86_32 5520 mova [stk+0x50], m7 5521 mova [stk+0x60], m8 5522 mova [stk+0x70], m9 5523 mova [stk+0x80], m10 5524 %xdefine m13 m7 5525 %define m7 [stk+0x50] 5526 %define m8 [stk+0x60] 5527 %define m9 [stk+0x70] 5528 %define m10 [stk+0x80] 5529 %endif 5530 punpcklwd m1, m0, m2 ; 01 23 5531 punpckhwd m3, m0, m2 ; 23 45 5532 %if ARCH_X86_32 5533 mov r4, r0m 5534 %define dstq r4 5535 mova [stk+0x20], m3 5536 mova [stk+0x30], m0 5537 %endif 5538.dy2_w2_loop: 5539 movu m4, [srcq+ssq*0] 5540 movu m5, [srcq+ssq*1] 5541 movu m6, [srcq+ssq*2] 5542 movu m13, [srcq+ss3q ] 5543 lea srcq, [srcq+ssq*4] 5544 pmaddwd m3, m8 5545 REPX {pshufb x, m14}, m4, m5, m6, m13 5546 REPX {pmaddwd x, m15}, m4, m5, m6, m13 5547 phaddd m4, m5 5548 phaddd m6, m13 5549 pmaddwd m5, m1, m7 5550 paddd m4, m11 5551 paddd m6, m11 5552 psrad m4, m12 5553 psrad m6, m12 5554 packssdw m4, m6 ; 6 7 8 9 5555 paddd m5, m3 5556 pshufd m3, m4, q2200 5557 pshufd m4, m4, q3311 5558 palignr m3, m0, 12 ; 4 6 6 8 5559 palignr m4, m2, 12 ; 5 7 7 9 5560 mova m0, m3 5561 mova m2, m4 5562 punpcklwd m1, m3, m4 5563 punpckhwd m3, m4 5564 pmaddwd m6, m1, m9 5565 pmaddwd m4, m3, m10 5566 paddd m5, vrnd_mem 5567 paddd m6, m4 5568 paddd m5, m6 5569 pshufd m4, m12, q1032 5570 pxor m6, m6 5571 psrad m5, m4 5572 packssdw m5, m5 5573 pmaxsw m5, m6 5574 pminsw m5, pxmaxm 5575 movd [dstq+dsq*0], 
m5 5576 pshuflw m5, m5, q1032 5577 movd [dstq+dsq*1], m5 5578 lea dstq, [dstq+dsq*2] 5579 sub hd, 2 5580 jg .dy2_w2_loop 5581 RET 5582%endif 5583INIT_XMM ssse3 5584.dy2_w4: 5585%if ARCH_X86_64 5586 mov myd, mym 5587 mova [rsp+0x10], m11 5588 mova [rsp+0x20], m12 5589 %if isput 5590 mova [rsp+0x30], m13 5591 %define vrnd_mem [rsp+0x30] 5592 %define stk rsp+0x40 5593 %else 5594 %define vrnd_mem [base+pd_m524256] 5595 %define stk rsp+0x30 5596 %endif 5597 movzx t0d, t0b 5598 sub srcq, 2 5599 movd m15, t0d 5600%else 5601 %define m10 [base+pd_0x3ff] 5602 %define m9 [base+pd_0x4000] 5603 %define m8 m0 5604 %xdefine m14 m4 5605 %define m15 m3 5606 %if isprep 5607 %define ssq r3 5608 %endif 5609 movzx r5, byte [esp+0x1f0] 5610 sub srcq, 2 5611 movd m15, r5 5612%endif 5613 pmaddwd m8, [base+rescale_mul] 5614%if ARCH_X86_64 5615 mova m9, [base+pd_0x4000] 5616%endif 5617 pshufd m15, m15, q0000 5618 paddd m14, m8 ; mx+dx*[0-3] 5619 pand m0, m14, m10 5620 psrld m0, 6 5621 paddd m15, m0 5622 pshufd m7, m15, q1032 5623%if ARCH_X86_64 5624 movd r4d, m15 5625 movd r11d, m7 5626 pshufd m15, m15, q0321 5627 pshufd m7, m7, q0321 5628 movd r6d, m15 5629 movd r13d, m7 5630 mova m10, [base+bdct_lb_q+ 0] 5631 mova m11, [base+bdct_lb_q+16] 5632 movd m13, [base+subpel_filters+ r4*8+2] 5633 movd m2, [base+subpel_filters+ r6*8+2] 5634 movd m15, [base+subpel_filters+r11*8+2] 5635 movd m4, [base+subpel_filters+r13*8+2] 5636%else 5637 movd r1, m15 5638 movd r4, m7 5639 pshufd m15, m15, q0321 5640 pshufd m7, m7, q0321 5641 movd r3, m15 5642 movd r5, m7 5643 mova m5, [base+bdct_lb_q+ 0] 5644 mova m6, [base+bdct_lb_q+16] 5645 movd m1, [base+subpel_filters+r1*8+2] 5646 movd m2, [base+subpel_filters+r3*8+2] 5647 movd m3, [base+subpel_filters+r4*8+2] 5648 movd m7, [base+subpel_filters+r5*8+2] 5649 SWAP m4, m7 5650 mov r3, r3m 5651 %if isprep 5652 lea ss3q, [ssq*3] 5653 %endif 5654 %define m10 m5 5655 %define m11 m6 5656 %define m12 m1 5657 %define m13 m1 5658%endif 5659 psrld m14, 10 5660 paddd m14, 
m14 5661 punpckldq m13, m2 5662 punpckldq m15, m4 5663 punpcklqdq m13, m15 5664 pxor m2, m2 5665 pcmpeqd m0, m2 5666%if ARCH_X86_64 5667 pand m9, m0 5668%else 5669 pand m2, m9, m0 5670 %define m9 m2 5671 SWAP m7, m4 5672%endif 5673 pandn m0, m13 5674%if ARCH_X86_64 5675 SWAP m13, m0 5676%else 5677 %define m13 m0 5678%endif 5679 por m13, m9 5680 punpckhbw m15, m13, m13 5681 punpcklbw m13, m13 5682 psraw m15, 8 5683 psraw m13, 8 5684 pshufb m12, m14, m10 5685 pshufb m14, m11 5686 mova m10, [base+spel_s_shuf2] 5687 movd r4d, m14 5688 shr r4d, 24 5689%if ARCH_X86_32 5690 mova [stk+0x40], m13 5691 mova [stk+0x50], m15 5692 pxor m2, m2 5693%endif 5694 pshufb m7, m14, m2 5695 psubb m14, m7 5696 paddb m12, m10 5697 paddb m14, m10 5698%if ARCH_X86_64 5699 lea r6, [r4+ssq*1] 5700 lea r11, [r4+ssq*2] 5701 lea r13, [r4+ss3q ] 5702 movu m1, [srcq+ssq*0] 5703 movu m8, [srcq+ssq*2] 5704 movu m9, [srcq+ssq*1] 5705 movu m10, [srcq+ss3q ] 5706 movu m7, [srcq+r4 ] 5707 movu m2, [srcq+r11 ] 5708 movu m3, [srcq+r6 ] 5709 movu m4, [srcq+r13 ] 5710 lea srcq, [srcq+ssq*4] 5711 REPX {pshufb x, m12}, m1, m9, m8, m10 5712 REPX {pmaddwd x, m13}, m1, m9, m8, m10 5713 REPX {pshufb x, m14}, m7, m3, m2, m4 5714 REPX {pmaddwd x, m15}, m7, m3, m2, m4 5715 mova m5, [rsp+0x10] 5716 movd xm6, [rsp+0x20] 5717 phaddd m1, m7 5718 phaddd m8, m2 5719 phaddd m9, m3 5720 phaddd m10, m4 5721 movu m2, [srcq+ssq*0] 5722 movu m3, [srcq+ssq*1] 5723 REPX {paddd x, m5}, m1, m9, m8, m10 5724 REPX {psrad x, xm6}, m1, m9, m8, m10 5725 packssdw m1, m8 ; 0 2 5726 packssdw m9, m10 ; 1 3 5727 movu m0, [srcq+r4 ] 5728 movu m8, [srcq+r6 ] 5729 lea srcq, [srcq+ssq*2] 5730 REPX {pshufb x, m12}, m2, m3 5731 REPX {pmaddwd x, m13}, m2, m3 5732 REPX {pshufb x, m14}, m0, m8 5733 REPX {pmaddwd x, m15}, m0, m8 5734 phaddd m2, m0 5735 phaddd m3, m8 5736 shr myd, 6 5737 mov r9d, 64 << 24 5738 lea myd, [t1+myq] 5739 cmovnz r9q, [base+subpel_filters+myq*8] 5740 REPX {paddd x, m5}, m2, m3 5741 REPX {psrad x, xm6}, m2, m3 5742 packssdw 
m2, m3 ; 4 5 5743 pshufd m3, m2, q1032 ; 5 _ 5744 punpcklwd m0, m1, m9 ; 01 5745 punpckhwd m1, m9 ; 23 5746 punpcklwd m2, m3 ; 45 5747 movq m10, r9 5748 %define hrnd_mem [rsp+0x10] 5749 %define hsh_mem [rsp+0x20] 5750 %define vsh_mem [rsp+0x28] 5751 %if isput 5752 %define vrnd_mem [rsp+0x30] 5753 %else 5754 %define vrnd_mem [base+pd_m524256] 5755 %endif 5756%else 5757 mova [stk+0x20], m12 5758 mova [stk+0x30], m14 5759 add r4, srcq 5760 MC_4TAP_SCALED_H 0x60 ; 0 1 5761 MC_4TAP_SCALED_H 0x70 ; 2 3 5762 MC_4TAP_SCALED_H 0x80 ; 4 5 5763 mov [stk+0xe0], r4 5764 mova m3, [base+spel_s_shuf8] 5765 mova m0, [stk+0x60] 5766 mova m1, [stk+0x70] 5767 mova m2, [stk+0x80] 5768 mov myd, mym 5769 mov rX, [esp+0x1f4] 5770 xor r5, r5 5771 shr myd, 6 5772 lea rX, [rX+myd] 5773 mov r4, 64 << 24 5774 cmovnz r4, [base+subpel_filters+rX*8+0] 5775 cmovnz r5, [base+subpel_filters+rX*8+4] 5776 mov r3, r3m 5777 pshufb m0, m3 ; 01 5778 pshufb m1, m3 ; 23 5779 pshufb m2, m3 ; 45 5780 movd m7, r4 5781 movd m4, r5 5782 mov r5, r0m 5783 %if isput 5784 mov r1, r1m 5785 %endif 5786 mov r4, [stk+0xe0] 5787 %define dstq r5 5788 %define tmpq r5 5789 %define m12 [stk+0x20] 5790 %define m14 [stk+0x30] 5791 %define m13 [stk+0x40] 5792 %define m15 [stk+0x50] 5793 %define hrnd_mem [esp+0x00] 5794 %define hsh_mem [esp+0x10] 5795 %define vsh_mem [esp+0x18] 5796 %if isput 5797 %define vrnd_mem [esp+0x20] 5798 %else 5799 %define vrnd_mem [base+pd_m524256] 5800 %endif 5801 %define m10 m7 5802 punpckldq m10, m4 5803%endif 5804 punpcklbw m10, m10 5805 psraw m10, 8 5806 pshufd m3, m10, q0000 5807 pshufd m4, m10, q1111 5808 pshufd m5, m10, q2222 5809 pshufd m10, m10, q3333 5810%if ARCH_X86_32 5811 %xdefine m8 m3 5812 %xdefine m9 m6 5813 %xdefine m11 m5 5814 %xdefine m6 m4 5815 mova [stk+0x100], m3 5816 mova [stk+0x110], m4 5817 mova [stk+0x120], m5 5818 mova [stk+0x130], m10 5819 %define m3 [stk+0x100] 5820 %define m4 [stk+0x110] 5821 %define m5 [stk+0x120] 5822 %define m10 [stk+0x130] 5823%endif 5824.dy2_w4_loop: 
5825 pmaddwd m8, m0, m3 5826 pmaddwd m9, m1, m3 5827 mova m0, m2 5828 pmaddwd m1, m4 5829 pmaddwd m11, m2, m4 5830 paddd m8, vrnd_mem 5831 paddd m9, vrnd_mem 5832 pmaddwd m2, m5 5833 paddd m8, m1 5834 paddd m9, m11 5835 paddd m8, m2 5836 movu m6, [srcq+ssq*0] 5837 movu m1, [srcq+ssq*2] 5838%if ARCH_X86_64 5839 movu m11, [srcq+r4 ] 5840 movu m2, [srcq+r11] 5841%else 5842 movu m11, [r4+ssq*0] 5843 movu m2, [r4+ssq*2] 5844%endif 5845 pshufb m6, m12 5846 pshufb m1, m12 5847 pmaddwd m6, m13 5848 pmaddwd m1, m13 5849 pshufb m11, m14 5850 pshufb m2, m14 5851 pmaddwd m11, m15 5852 pmaddwd m2, m15 5853 phaddd m6, m11 5854 phaddd m1, m2 5855 paddd m6, hrnd_mem 5856 paddd m1, hrnd_mem 5857 psrad m6, hsh_mem 5858 psrad m1, hsh_mem 5859 movu m7, [srcq+ssq*1] 5860 movu m11, [srcq+ss3q ] 5861 packssdw m6, m1 ; 6 8 5862%if ARCH_X86_64 5863 movu m2, [srcq+r6 ] 5864 movu m1, [srcq+r13] 5865%else 5866 movu m2, [r4+ssq*1] 5867 movu m1, [r4+ss3q ] 5868%endif 5869 pshufb m7, m12 5870 pshufb m11, m12 5871 pmaddwd m7, m13 5872 pmaddwd m11, m13 5873 pshufb m2, m14 5874 pshufb m1, m14 5875 pmaddwd m2, m15 5876 pmaddwd m1, m15 5877 phaddd m7, m2 5878 phaddd m11, m1 5879 paddd m7, hrnd_mem 5880 paddd m11, hrnd_mem 5881 psrad m7, hsh_mem 5882 psrad m11, hsh_mem 5883 packssdw m7, m11 ; 7 9 5884%if ARCH_X86_32 5885 lea r4, [r4+ssq*4] 5886%endif 5887 lea srcq, [srcq+ssq*4] 5888 punpcklwd m1, m6, m7 ; 67 5889 punpckhwd m6, m7 ; 89 5890 mova m2, m6 5891 pmaddwd m11, m1, m5 5892 pmaddwd m7, m1, m10 5893 pmaddwd m6, m10 5894 paddd m9, m11 5895%if isput 5896 movd m11, vsh_mem 5897%endif 5898 paddd m8, m7 5899 paddd m9, m6 5900%if isput 5901 psrad m8, m11 5902 psrad m9, m11 5903 packssdw m8, m9 5904 pxor m7, m7 5905 pmaxsw m8, m7 5906 pminsw m8, pxmaxm 5907 movq [dstq+dsq*0], m8 5908 movhps [dstq+dsq*1], m8 5909 lea dstq, [dstq+dsq*2] 5910%else 5911 psrad m8, 6 5912 psrad m9, 6 5913 packssdw m8, m9 5914 mova [tmpq], m8 5915 add tmpq, 16 5916%endif 5917 sub hd, 2 5918 jg .dy2_w4_loop 5919 
MC_8TAP_SCALED_RET ; why not jz .ret? 5920INIT_XMM ssse3 5921.dy2_w8: 5922 mov dword [stk+0xf0], 1 5923 movifprep tmp_stridem, 16 5924 jmp .dy2_w_start 5925.dy2_w16: 5926 mov dword [stk+0xf0], 2 5927 movifprep tmp_stridem, 32 5928 jmp .dy2_w_start 5929.dy2_w32: 5930 mov dword [stk+0xf0], 4 5931 movifprep tmp_stridem, 64 5932 jmp .dy2_w_start 5933.dy2_w64: 5934 mov dword [stk+0xf0], 8 5935 movifprep tmp_stridem, 128 5936 jmp .dy2_w_start 5937.dy2_w128: 5938 mov dword [stk+0xf0], 16 5939 movifprep tmp_stridem, 256 5940.dy2_w_start: 5941 mov myd, mym 5942%if ARCH_X86_64 5943 %ifidn %1, put 5944 movifnidn dsm, dsq 5945 %endif 5946 mova [rsp+0x10], m11 5947 mova [rsp+0x20], m12 5948 %define hround m11 5949 %if isput 5950 mova [rsp+0x30], m13 5951 %else 5952 mova m13, [base+pd_m524256] 5953 %endif 5954 shr t0d, 16 5955 shr myd, 6 5956 mov r4d, 64 << 24 5957 lea myd, [t1+myq] 5958 cmovnz r4q, [base+subpel_filters+myq*8] 5959 movd m15, t0d 5960%else 5961 %define hround [esp+0x00] 5962 %define m12 [esp+0x10] 5963 %define m10 [base+pd_0x3ff] 5964 %define m8 m0 5965 %xdefine m14 m4 5966 %xdefine m15 m3 5967 %if isput 5968 %define dstq r0 5969 %else 5970 %define tmpq r0 5971 %define ssq ssm 5972 %endif 5973 mov r5, [esp+0x1f0] 5974 mov r3, [esp+0x1f4] 5975 shr r5, 16 5976 movd m15, r5 5977 xor r5, r5 5978 shr myd, 6 5979 lea r3, [r3+myd] 5980 mov r4, 64 << 24 5981 cmovnz r4, [base+subpel_filters+r3*8+0] 5982 cmovnz r5, [base+subpel_filters+r3*8+4] 5983 mov r0, r0m 5984 mov r3, r3m 5985%endif 5986 sub srcq, 6 5987 pslld m7, m8, 2 ; dx*4 5988 pmaddwd m8, [base+rescale_mul] ; dx*[0-3] 5989 pshufd m15, m15, q0000 5990 paddd m14, m8 ; mx+dx*[0-3] 5991%if ARCH_X86_64 5992 movq m3, r4q 5993%else 5994 movd m5, r4 5995 movd m6, r5 5996 punpckldq m5, m6 5997 SWAP m3, m5 5998%endif 5999 punpcklbw m3, m3 6000 psraw m3, 8 6001 mova [stk+0x100], m7 6002 mova [stk+0x120], m15 6003 mov [stk+0x0f8], srcq 6004 mov [stk+0x130], r0q ; dstq / tmpq 6005 pshufd m0, m3, q0000 6006 pshufd m1, m3, 
q1111 6007 pshufd m2, m3, q2222 6008 pshufd m3, m3, q3333 6009%if ARCH_X86_64 6010 mova [stk+0x140], m0 6011 mova [stk+0x150], m1 6012 mova [stk+0x160], m2 6013 mova [stk+0x170], m3 6014 %if UNIX64 6015 mov hm, hd 6016 %endif 6017%else 6018 mova [stk+0x180], m0 6019 mova [stk+0x190], m1 6020 mova [stk+0x1a0], m2 6021 mova [stk+0x1b0], m3 6022 SWAP m5, m3 6023 mov r5, hm 6024 mov [stk+0x134], r5 6025%endif 6026 jmp .dy2_hloop 6027.dy2_hloop_prep: 6028 dec dword [stk+0x0f0] 6029 jz .ret 6030%if ARCH_X86_64 6031 add qword [stk+0x130], 16 6032 mov hd, hm 6033%else 6034 add dword [stk+0x130], 16 6035 mov r5, [stk+0x134] 6036 mov r0, [stk+0x130] 6037%endif 6038 mova m7, [stk+0x100] 6039 mova m14, [stk+0x110] 6040%if ARCH_X86_64 6041 mova m10, [base+pd_0x3ff] 6042 mova m11, [rsp+0x10] 6043%endif 6044 mova m15, [stk+0x120] 6045 mov srcq, [stk+0x0f8] 6046%if ARCH_X86_64 6047 mov r0q, [stk+0x130] ; dstq / tmpq 6048%else 6049 mov hm, r5 6050 mov r0m, r0 6051 mov r3, r3m 6052%endif 6053 paddd m14, m7 6054.dy2_hloop: 6055%if ARCH_X86_64 6056 mova m9, [base+pq_0x40000000] 6057%else 6058 %define m9 [base+pq_0x40000000] 6059%endif 6060 pxor m1, m1 6061 psrld m2, m14, 10 6062 mova [stk], m2 6063 pand m6, m14, m10 6064 psrld m6, 6 6065 paddd m5, m15, m6 6066 pcmpeqd m6, m1 6067 pshufd m2, m5, q1032 6068%if ARCH_X86_64 6069 movd r4d, m5 6070 movd r6d, m2 6071 pshufd m5, m5, q0321 6072 pshufd m2, m2, q0321 6073 movd r7d, m5 6074 movd r9d, m2 6075 movq m0, [base+subpel_filters+r4*8] 6076 movq m1, [base+subpel_filters+r6*8] 6077 movhps m0, [base+subpel_filters+r7*8] 6078 movhps m1, [base+subpel_filters+r9*8] 6079%else 6080 movd r0, m5 6081 movd rX, m2 6082 pshufd m5, m5, q0321 6083 pshufd m2, m2, q0321 6084 movd r4, m5 6085 movd r5, m2 6086 movq m0, [base+subpel_filters+r0*8] 6087 movq m1, [base+subpel_filters+rX*8] 6088 movhps m0, [base+subpel_filters+r4*8] 6089 movhps m1, [base+subpel_filters+r5*8] 6090%endif 6091 paddd m14, m7 ; mx+dx*[4-7] 6092 pand m5, m14, m10 6093 psrld m5, 6 
6094 paddd m15, m5 6095 pxor m2, m2 6096 pcmpeqd m5, m2 6097 mova [stk+0x110], m14 6098 pshufd m4, m15, q1032 6099%if ARCH_X86_64 6100 movd r10d, m15 6101 movd r11d, m4 6102 pshufd m15, m15, q0321 6103 pshufd m4, m4, q0321 6104 movd r13d, m15 6105 movd rXd, m4 6106 movq m2, [base+subpel_filters+r10*8] 6107 movq m3, [base+subpel_filters+r11*8] 6108 movhps m2, [base+subpel_filters+r13*8] 6109 movhps m3, [base+subpel_filters+ rX*8] 6110 psrld m14, 10 6111 movq r11, m14 6112 punpckhqdq m14, m14 6113 movq rX, m14 6114 mov r10d, r11d 6115 shr r11, 32 6116 mov r13d, rXd 6117 shr rX, 32 6118 mov r4d, [stk+ 0] 6119 mov r6d, [stk+ 4] 6120 mov r7d, [stk+ 8] 6121 mov r9d, [stk+12] 6122 pshufd m4, m6, q1100 6123 pshufd m6, m6, q3322 6124 pshufd m14, m5, q1100 6125 pshufd m5, m5, q3322 6126 pand m7, m9, m4 6127 pand m8, m9, m6 6128 pand m15, m9, m14 6129 pand m9, m9, m5 6130 pandn m4, m0 6131 pandn m6, m1 6132 pandn m14, m2 6133 pandn m5, m3 6134 por m7, m4 6135 por m8, m6 6136 por m15, m14 6137 por m9, m5 6138 punpcklbw m0, m7, m7 6139 punpckhbw m7, m7 6140 punpcklbw m1, m8, m8 6141 punpckhbw m8, m8 6142 psraw m0, 8 6143 psraw m7, 8 6144 psraw m1, 8 6145 psraw m8, 8 6146 punpcklbw m2, m15, m15 6147 punpckhbw m15, m15 6148 punpcklbw m3, m9, m9 6149 punpckhbw m9, m9 6150 psraw m2, 8 6151 psraw m15, 8 6152 psraw m3, 8 6153 psraw m9, 8 6154 mova [stk+0x10], m0 6155 mova [stk+0x20], m7 6156 mova [stk+0x30], m1 6157 mova [stk+0x40], m8 6158 mova [stk+0x50], m2 6159 mova [stk+0x60], m15 6160 mova [stk+0x70], m3 6161 mova [stk+0x80], m9 6162 MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0 6163 mova [stk+0x90], m1 6164 MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1 6165 mova [stk+0xa0], m2 6166 MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2 6167 mova [stk+0xb0], m3 6168 MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3 6169 mova [stk+0xc0], m4 6170 MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4 6171 mova [stk+0xd0], m5 6172 MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5 6173 MC_8TAP_SCALED_H 7, 1, 2, 
3, 4, 5, 9, 10 ; 6 6174 MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7 6175 mova m5, [stk+0xd0] 6176 mova m1, [stk+0x90] 6177 mova m2, [stk+0xa0] 6178 mova m3, [stk+0xb0] 6179 mova m9, [stk+0xc0] 6180 punpcklwd m4, m5, m6 ; 45a 6181 punpckhwd m5, m6 ; 45b 6182 punpcklwd m6, m7, m8 ; 67a 6183 punpckhwd m7, m8 ; 67b 6184 punpcklwd m0, m1, m2 ; 01a 6185 punpckhwd m1, m2 ; 01b 6186 punpcklwd m2, m3, m9 ; 23a 6187 punpckhwd m3, m9 ; 23b 6188 mova m10, [stk+0x140] 6189 mova m11, [stk+0x150] 6190 mova m14, [stk+0x160] 6191 mova m15, [stk+0x170] 6192 mova [stk+0x90], m4 6193 mova [stk+0xa0], m5 6194 mova [stk+0xb0], m6 6195 mova [stk+0xc0], m7 6196 %define hround [rsp+0x10] 6197 %define shift [rsp+0x20] 6198 %if isput 6199 %define vround [rsp+0x30] 6200 %else 6201 %define vround [base+pd_m524256] 6202 %endif 6203.dy2_vloop: 6204 pmaddwd m4, m0, m10 6205 pmaddwd m5, m1, m10 6206 pmaddwd m6, m2, m11 6207 pmaddwd m7, m3, m11 6208 paddd m4, m13 6209 paddd m5, m13 6210 paddd m4, m6 6211 paddd m5, m7 6212 pmaddwd m6, [stk+0x90], m14 6213 pmaddwd m7, [stk+0xa0], m14 6214 pmaddwd m8, [stk+0xb0], m15 6215 pmaddwd m9, [stk+0xc0], m15 6216 paddd m4, m6 6217 paddd m5, m7 6218 %if isput 6219 pshufd m6, m12, q1032 6220 %endif 6221 paddd m4, m8 6222 paddd m5, m9 6223%else 6224 movd r0, m15 6225 movd rX, m4 6226 pshufd m15, m15, q0321 6227 pshufd m4, m4, q0321 6228 movd r4, m15 6229 movd r5, m4 6230 mova m14, [stk+0x110] 6231 movq m2, [base+subpel_filters+r0*8] 6232 movq m3, [base+subpel_filters+rX*8] 6233 movhps m2, [base+subpel_filters+r4*8] 6234 movhps m3, [base+subpel_filters+r5*8] 6235 psrld m14, 10 6236 mova [stk+16], m14 6237 mov r0, [stk+ 0] 6238 mov rX, [stk+ 4] 6239 mov r4, [stk+ 8] 6240 mov r5, [stk+12] 6241 mova [stk+0x20], m0 6242 mova [stk+0x30], m1 6243 mova [stk+0x40], m2 6244 mova [stk+0x50], m3 6245 pshufd m4, m6, q1100 6246 pshufd m6, m6, q3322 6247 pshufd m7, m5, q1100 6248 pshufd m5, m5, q3322 6249 pand m0, m9, m4 6250 pand m1, m9, m6 6251 pand m2, m9, m7 6252 pand m3, 
m9, m5 6253 pandn m4, [stk+0x20] 6254 pandn m6, [stk+0x30] 6255 pandn m7, [stk+0x40] 6256 pandn m5, [stk+0x50] 6257 por m0, m4 6258 por m1, m6 6259 por m2, m7 6260 por m3, m5 6261 punpcklbw m4, m0, m0 6262 punpckhbw m0, m0 6263 punpcklbw m5, m1, m1 6264 punpckhbw m1, m1 6265 psraw m4, 8 6266 psraw m0, 8 6267 psraw m5, 8 6268 psraw m1, 8 6269 punpcklbw m6, m2, m2 6270 punpckhbw m2, m2 6271 punpcklbw m7, m3, m3 6272 punpckhbw m3, m3 6273 psraw m6, 8 6274 psraw m2, 8 6275 psraw m7, 8 6276 psraw m3, 8 6277 mova [stk+0x0a0], m4 6278 mova [stk+0x0b0], m0 6279 mova [stk+0x0c0], m5 6280 mova [stk+0x0d0], m1 6281 mova [stk+0x140], m6 6282 mova [stk+0x150], m2 6283 mova [stk+0x160], m7 6284 mova [stk+0x170], m3 6285 MC_8TAP_SCALED_H 0xa0, 0x20, 0 ; 0 6286 MC_8TAP_SCALED_H 0xa0, 0x30 ; 1 6287 MC_8TAP_SCALED_H 0xa0, 0x40 ; 2 6288 MC_8TAP_SCALED_H 0xa0, 0x50 ; 3 6289 MC_8TAP_SCALED_H 0xa0, 0x60 ; 4 6290 MC_8TAP_SCALED_H 0xa0, 0x70 ; 5 6291 MC_8TAP_SCALED_H 0xa0, 0x80 ; 6 6292 MC_8TAP_SCALED_H 0xa0, 0x90 ; 7 6293 mova m5, [stk+0x60] 6294 mova m6, [stk+0x70] 6295 mova m7, [stk+0x80] 6296 mova m0, [stk+0x90] 6297 mov r0, r0m 6298 punpcklwd m4, m5, m6 ; 45a 6299 punpckhwd m5, m6 ; 45b 6300 punpcklwd m6, m7, m0 ; 67a 6301 punpckhwd m7, m0 ; 67b 6302 mova [stk+0x60], m4 6303 mova [stk+0x70], m5 6304 mova [stk+0x80], m6 6305 mova [stk+0x90], m7 6306 mova m1, [stk+0x20] 6307 mova m2, [stk+0x30] 6308 mova m3, [stk+0x40] 6309 mova m4, [stk+0x50] 6310 punpcklwd m0, m1, m2 ; 01a 6311 punpckhwd m1, m2 ; 01b 6312 punpcklwd m2, m3, m4 ; 23a 6313 punpckhwd m3, m4 ; 23b 6314 mova m4, [stk+0x180] 6315 mova m5, [stk+0x190] 6316 mova m6, [stk+0x1a0] 6317 mova m7, [stk+0x1b0] 6318 mova [stk+0x40], m2 6319 mova [stk+0x50], m3 6320.dy2_vloop: 6321 pmaddwd m0, m4 6322 pmaddwd m1, m4 6323 pmaddwd m2, m5 6324 pmaddwd m3, m5 6325 paddd m0, m2 6326 paddd m1, m3 6327 pmaddwd m2, [stk+0x60], m6 6328 pmaddwd m3, [stk+0x70], m6 6329 pmaddwd m4, [stk+0x80], m7 6330 pmaddwd m5, [stk+0x90], m7 6331 %if isput 
6332 movd m6, [esp+0x18] 6333 %endif 6334 paddd m0, m2 6335 paddd m1, m3 6336 paddd m0, vrnd_mem 6337 paddd m1, vrnd_mem 6338 paddd m4, m0 6339 paddd m5, m1 6340%endif 6341%ifidn %1, put 6342 psrad m4, m6 6343 psrad m5, m6 6344 packssdw m4, m5 6345 pxor m7, m7 6346 pmaxsw m4, m7 6347 pminsw m4, pxmaxm 6348 mova [dstq], m4 6349 add dstq, dsm 6350%else 6351 psrad m4, 6 6352 psrad m5, 6 6353 packssdw m4, m5 6354 mova [tmpq], m4 6355 add tmpq, tmp_stridem 6356%endif 6357 dec hd 6358 jz .dy2_hloop_prep 6359%if ARCH_X86_64 6360 MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1 6361 mova [stk+0xd0], m4 6362 MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1 6363 mova m4, [stk+0xd0] 6364 mova m0, m2 ; 01a 6365 mova m1, m3 ; 01b 6366 mova m2, [stk+0x90] ; 23a 6367 mova m3, [stk+0xa0] ; 23b 6368 mova m5, [stk+0xb0] ; 45a 6369 mova m6, [stk+0xc0] ; 45b 6370 punpcklwd m7, m4, m8 ; 67a 6371 punpckhwd m4, m8 ; 67b 6372 mova [stk+0x90], m5 6373 mova [stk+0xa0], m6 6374 mova [stk+0xb0], m7 6375 mova [stk+0xc0], m4 6376%else 6377 mov r0m, r0 6378 mov r3, r3m 6379 MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8 6380 MC_8TAP_SCALED_H 0xa0, 0 ; 9 6381 mova m7, [stk+0xe0] 6382 mova m2, [stk+0x60] ; 23a 6383 mova m3, [stk+0x70] ; 23b 6384 mova m4, [stk+0x80] ; 45a 6385 mova m5, [stk+0x90] ; 45b 6386 punpcklwd m6, m7, m0 ; 67a 6387 punpckhwd m7, m0 ; 67b 6388 mova m0, [stk+0x40] ; 01a 6389 mova m1, [stk+0x50] ; 01b 6390 mova [stk+0x40], m2 6391 mova [stk+0x50], m3 6392 mova [stk+0x60], m4 6393 mova [stk+0x70], m5 6394 mova m4, [stk+0x180] 6395 mova m5, [stk+0x190] 6396 mova [stk+0x80], m6 6397 mova [stk+0x90], m7 6398 mova m6, [stk+0x1a0] 6399 mova m7, [stk+0x1b0] 6400 mov r0, r0m 6401%endif 6402 jmp .dy2_vloop 6403INIT_XMM ssse3 6404.ret: 6405 MC_8TAP_SCALED_RET 0 6406%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT 6407 %define r0m [rstk+stack_offset+ 4] 6408 %define r1m [rstk+stack_offset+ 8] 6409 %define r2m [rstk+stack_offset+12] 6410 %define r3m [rstk+stack_offset+16] 6411%endif 6412%undef 
isput 6413%undef isprep 6414%endmacro 6415 6416%macro BILIN_SCALED_FN 1 6417cglobal %1_bilin_scaled_16bpc 6418 mov t0d, (5*15 << 16) | 5*15 6419 mov t1d, (5*15 << 16) | 5*15 6420 jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX) 6421%endmacro 6422 6423%if WIN64 6424DECLARE_REG_TMP 6, 5 6425%elif ARCH_X86_64 6426DECLARE_REG_TMP 6, 8 6427%else 6428DECLARE_REG_TMP 1, 2 6429%endif 6430BILIN_SCALED_FN put 6431FN put_8tap_scaled, sharp, SHARP, SHARP 6432FN put_8tap_scaled, sharp_smooth, SHARP, SMOOTH 6433FN put_8tap_scaled, smooth_sharp, SMOOTH, SHARP 6434FN put_8tap_scaled, smooth, SMOOTH, SMOOTH 6435FN put_8tap_scaled, sharp_regular, SHARP, REGULAR 6436FN put_8tap_scaled, regular_sharp, REGULAR, SHARP 6437FN put_8tap_scaled, smooth_regular, SMOOTH, REGULAR 6438FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH 6439FN put_8tap_scaled, regular, REGULAR, REGULAR 6440MC_8TAP_SCALED put 6441 6442%if WIN64 6443DECLARE_REG_TMP 5, 4 6444%elif ARCH_X86_64 6445DECLARE_REG_TMP 6, 7 6446%else 6447DECLARE_REG_TMP 1, 2 6448%endif 6449BILIN_SCALED_FN prep 6450FN prep_8tap_scaled, sharp, SHARP, SHARP 6451FN prep_8tap_scaled, sharp_smooth, SHARP, SMOOTH 6452FN prep_8tap_scaled, smooth_sharp, SMOOTH, SHARP 6453FN prep_8tap_scaled, smooth, SMOOTH, SMOOTH 6454FN prep_8tap_scaled, sharp_regular, SHARP, REGULAR 6455FN prep_8tap_scaled, regular_sharp, REGULAR, SHARP 6456FN prep_8tap_scaled, smooth_regular, SMOOTH, REGULAR 6457FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH 6458FN prep_8tap_scaled, regular, REGULAR, REGULAR 6459MC_8TAP_SCALED prep 6460 6461%if ARCH_X86_64 6462DECLARE_REG_TMP 6 6463%else 6464DECLARE_REG_TMP 2 6465%endif 6466 6467%if ARCH_X86_64 6468; warp8x8t spills one less xmm register than warp8x8 on WIN64, compensate that 6469; by allocating 16 bytes more stack space so that stack offsets match up. 
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14 ; +16 bytes so 8x8t/8x8 stack offsets match (see above)
%else
%assign stksz 16*13
%endif
; Warp affine 8x8, intermediate (prep) output variant. Shares the actual
; filtering core with warp_affine_8x8_16bpc below by tail-calling its
; .main/.main2/.main3 labels; only the final rounding/store differs here
; (round with warp8x8t_rnd, arithmetic shift by 15, no pixel clamping).
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
                                          mx, tmp, alpha, beta, \
                                          filter, my, gamma, cnt
%assign stack_size_padded_8x8t stack_size_padded
%else
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                         filter, mx, my
; x86-32: not enough registers, keep extra state in stack slots
%define m8 [esp+16*13]
%define m9 [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq 0
%if STACK_ALIGNMENT < 16
%define dstm [esp+4*65]
%define dsm [esp+4*66]
%else
%define dstm r0m
%define dsm r1m
%endif
%endif
%define base filterq-$$
 mov t0d, r7m                ; pixel_max
 LEA filterq, $$             ; base pointer for PC-relative data access
 shr t0d, 11                 ; table index for bitdepth-dependent constants
%if ARCH_X86_64
 movddup m8, [base+warp8x8t_rnd]
%else
 movddup m1, [base+warp8x8t_rnd]
 mov r1, r1m
 add r1, r1
 mova m8, m1
 mov r1m, r1 ; ds *= 2
%endif
 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
 jmp .start
.loop:
%if ARCH_X86_64
 lea dstq, [dstq+dsq*4]      ; advance past the two rows stored per iteration
%else
 add dstq, dsm
 mov dstm, dstq
%endif
 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
 mov dstq, dstm
%endif
 ; m1/m2 hold one filtered row as 2x4 dwords: round, shift, narrow to words
 paddd m1, m8
 paddd m2, m8
 psrad m1, 15
 psrad m2, 15
 packssdw m1, m2
 mova [dstq+dsq*0], m1
 call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
 mov dstq, dstm
 add dstq, dsm
%endif
 paddd m1, m8
 paddd m2, m8
 psrad m1, 15
 psrad m2, 15
 packssdw m1, m2
 mova [dstq+dsq*2], m1
 dec cntd
 jg .loop
 RET

; Warp affine 8x8, pixel-output (put) variant. Its .main/.main2/.main3
; helpers are also tail-called by warp_affine_8x8t_16bpc above, which is
; why the two functions' stack layouts must match (see ASSERT below).
%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
                                          mx, tmp, alpha, beta, \
                                          filter, my, gamma, cnt
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8,
-16*17, alpha, gamma, src, tmp, \ 6550 filter, mx, my 6551%endif 6552 mov t0d, r7m 6553 LEA filterq, $$ 6554 shr t0d, 11 6555%if ARCH_X86_64 6556 movddup m8, [base+warp8x8_rnd2+t0*8] 6557 movd m9, r7m ; pixel_max 6558 pshufb m9, [base+pw_256] 6559%else 6560 movddup m1, [base+warp8x8_rnd2+t0*8] 6561 movd m2, r7m ; pixel_max 6562 pshufb m2, [base+pw_256] 6563 mova m8, m1 6564 mova m9, m2 6565%endif 6566 call .main 6567 jmp .start 6568.loop: 6569%if ARCH_X86_64 6570 lea dstq, [dstq+dsq*2] 6571%else 6572 add dstq, dsm 6573 mov dstm, dstq 6574%endif 6575 call .main2 6576.start: 6577%if ARCH_X86_32 6578 mov dstq, dstm 6579%endif 6580 psrad m1, 16 6581 psrad m2, 16 6582 packssdw m1, m2 6583 pmaxsw m1, m6 6584 pmulhrsw m1, m8 6585 pminsw m1, m9 6586 mova [dstq+dsq*0], m1 6587 call .main3 6588%if ARCH_X86_32 6589 mov dstq, dstm 6590 add dstq, dsm 6591%endif 6592 psrad m1, 16 6593 psrad m2, 16 6594 packssdw m1, m2 6595 pmaxsw m1, m6 6596 pmulhrsw m1, m8 6597 pminsw m1, m9 6598 mova [dstq+dsq*1], m1 6599 dec cntd 6600 jg .loop 6601 RET 6602ALIGN function_align 6603.main: 6604 ; Stack args offset by one (r4m -> r5m etc.) 
due to call 6605%if WIN64 6606 mov deltaq, r5m 6607 mov mxd, r6m 6608%endif 6609 movd m0, [base+warp8x8_shift+t0*4] 6610 movddup m7, [base+warp8x8_rnd1+t0*8] 6611 add filterq, mc_warp_filter-$$ 6612%if ARCH_X86_64 6613 movsx alphad, word [deltaq+2*0] 6614 movsx betad, word [deltaq+2*1] 6615 movsx gammad, word [deltaq+2*2] 6616 movsx deltad, word [deltaq+2*3] 6617 lea tmpq, [ssq*3] 6618 add mxd, 512+(64<<10) 6619 sub srcq, tmpq ; src -= ss*3 6620 imul tmpd, alphad, -7 6621 mov myd, r7m 6622 add betad, tmpd ; beta -= alpha*7 6623 imul tmpd, gammad, -7 6624 add myd, 512+(64<<10) 6625 mov cntd, 4 6626 add deltad, tmpd ; delta -= gamma*7 6627%else 6628%if STACK_ALIGNMENT < 16 6629 %assign stack_offset stack_offset - gprsize 6630%endif 6631 mov r3d, r5m ; abcd 6632%if STACK_ALIGNMENT < 16 6633 mov r0, r1m ; dst 6634 mov r1, r2m ; ds 6635 mov [esp+gprsize+4*65], r0 6636 mov [esp+gprsize+4*66], r1 6637%endif 6638 movsx alphad, word [r3+2*0] 6639 movsx r2d, word [r3+2*1] 6640 movsx gammad, word [r3+2*2] 6641 movsx r3d, word [r3+2*3] 6642 imul r5d, alphad, -7 6643 add r2d, r5d ; beta -= alpha*7 6644 imul r5d, gammad, -7 6645 mov [esp+gprsize+4*60], r2d 6646 add r3d, r5d ; delta -= gamma*7 6647 mov [esp+gprsize+4*61], r3d 6648 mov r3d, r4m ; ss 6649 mov srcq, r3m 6650 mov mxd, r6m 6651 mov myd, r7m 6652 mov dword [esp+gprsize+4*63], 4 ; cnt 6653 mov [esp+gprsize+4*62], r3 6654 lea r3, [r3*3] 6655 add mxd, 512+(64<<10) 6656 add myd, 512+(64<<10) 6657 sub srcq, r3 ; src -= ss*3 6658%if STACK_ALIGNMENT < 16 6659 %assign stack_offset stack_offset + gprsize 6660%endif 6661%endif 6662 mova [rsp+gprsize], m0 6663 pxor m6, m6 6664 call .h 6665 mova m5, m0 6666 call .h 6667 punpcklwd m1, m5, m0 ; 01 6668 punpckhwd m5, m0 6669 mova [rsp+gprsize+16* 1], m1 6670 mova [rsp+gprsize+16* 4], m5 6671 mova m5, m0 6672 call .h 6673 punpcklwd m1, m5, m0 ; 12 6674 punpckhwd m5, m0 6675 mova [rsp+gprsize+16* 7], m1 6676 mova [rsp+gprsize+16*10], m5 6677 mova m5, m0 6678 call .h 6679 punpcklwd m1, 
m5, m0 ; 23 6680 punpckhwd m5, m0 6681 mova [rsp+gprsize+16* 2], m1 6682 mova [rsp+gprsize+16* 5], m5 6683 mova m5, m0 6684 call .h 6685 punpcklwd m1, m5, m0 ; 34 6686 punpckhwd m5, m0 6687 mova [rsp+gprsize+16* 8], m1 6688 mova [rsp+gprsize+16*11], m5 6689 mova m5, m0 6690 call .h 6691 punpcklwd m1, m5, m0 ; 45 6692 punpckhwd m5, m0 6693 mova [rsp+gprsize+16* 3], m1 6694 mova [rsp+gprsize+16* 6], m5 6695 mova m5, m0 6696 call .h 6697 punpcklwd m1, m5, m0 ; 56 6698 punpckhwd m5, m0 6699 mova [rsp+gprsize+16* 9], m1 6700 mova [rsp+gprsize+16*12], m5 6701 mova m5, m0 6702.main2: 6703 call .h 6704%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h 6705 lea tmpd, [myq+gammaq] 6706 shr myd, 10 6707 movq m4, [filterq+myq*8] ; a 6708 lea myd, [tmpq+gammaq] 6709 shr tmpd, 10 6710 movq m2, [filterq+tmpq*8] ; b 6711 lea tmpd, [myq+gammaq] 6712 shr myd, 10 6713 movq m3, [filterq+myq*8] ; c 6714 lea myd, [tmpq+gammaq] 6715 shr tmpd, 10 6716 movq m1, [filterq+tmpq*8] ; d 6717 lea tmpd, [myq+gammaq] 6718 shr myd, 10 6719 punpcklwd m4, m2 6720 punpcklwd m3, m1 6721 punpckldq m2, m4, m3 6722 punpckhdq m4, m3 6723 punpcklbw m1, m6, m2 ; a0 a1 b0 b1 c0 c1 d0 d1 << 8 6724 pmaddwd m1, [rsp+gprsize+16*%1] 6725 punpckhbw m3, m6, m2 ; a2 a3 b2 b3 c2 c3 d2 d3 << 8 6726 mova m2, [rsp+gprsize+16*%2] 6727 pmaddwd m3, m2 6728 mova [rsp+gprsize+16*%1], m2 6729 paddd m1, m3 6730 punpcklbw m3, m6, m4 ; a4 a5 b4 b5 c4 c5 d4 d5 << 8 6731 mova m2, [rsp+gprsize+16*%3] 6732 pmaddwd m3, m2 6733 mova [rsp+gprsize+16*%2], m2 6734 paddd m1, m3 6735 punpcklwd m3, m5, m0 ; 67 6736 punpckhbw m2, m6, m4 ; a6 a7 b6 b7 c6 c7 d6 d7 << 8 6737 pmaddwd m2, m3 6738 mova [rsp+gprsize+16*%3], m3 6739 paddd m1, m2 6740 movq m4, [filterq+myq*8] ; e 6741 lea myd, [tmpq+gammaq] 6742 shr tmpd, 10 6743 movq m3, [filterq+tmpq*8] ; f 6744 lea tmpd, [myq+gammaq] 6745 shr myd, 10 6746 movq m2, [filterq+myq*8] ; g 6747%if ARCH_X86_64 6748 lea myd, [tmpq+deltaq] ; my += delta 6749%else 6750 mov myd, [esp+gprsize+4*61] 6751 add myd, 
tmpd 6752%endif 6753 shr tmpd, 10 6754 punpcklwd m4, m3 6755 movq m3, [filterq+tmpq*8] ; h 6756 punpcklwd m2, m3 6757 punpckldq m3, m4, m2 6758 punpckhdq m4, m2 6759 punpcklbw m2, m6, m3 ; e0 e1 f0 f1 g0 g1 h0 h1 << 8 6760 pmaddwd m2, [rsp+gprsize+16*%4] 6761 punpckhbw m6, m3 ; e2 e3 f2 f3 g2 g3 h2 h3 << 8 6762 mova m3, [rsp+gprsize+16*%5] 6763 pmaddwd m6, m3 6764 mova [rsp+gprsize+16*%4], m3 6765 pxor m3, m3 6766 paddd m2, m6 6767 punpcklbw m3, m4 ; e4 e5 f4 f5 g4 g5 h4 h5 << 8 6768 mova m6, [rsp+gprsize+16*%6] 6769 pmaddwd m3, m6 6770 mova [rsp+gprsize+16*%5], m6 6771 punpckhwd m5, m0 6772 pxor m6, m6 6773 paddd m2, m3 6774 punpckhbw m3, m6, m4 ; e6 e7 f6 f7 g6 g7 h6 h7 << 8 6775 pmaddwd m3, m5 6776 mova [rsp+gprsize+16*%6], m5 6777 mova m5, m0 6778 paddd m2, m3 6779%endmacro 6780 WARP_V 1, 2, 3, 4, 5, 6 6781 ret 6782.main3: 6783 call .h 6784 WARP_V 7, 8, 9, 10, 11, 12 6785 ret 6786ALIGN function_align 6787.h: 6788 lea tmpd, [mxq+alphaq] 6789 shr mxd, 10 6790 movq m3, [filterq+mxq*8] 6791 punpcklbw m0, m6, m3 6792 movu m3, [srcq-6] 6793 pmaddwd m0, m3 ; 0 6794 lea mxd, [tmpq+alphaq] 6795 shr tmpd, 10 6796 movq m3, [filterq+tmpq*8] 6797 punpcklbw m2, m6, m3 6798 movu m3, [srcq-4] 6799 pmaddwd m2, m3 ; 1 6800 lea tmpd, [mxq+alphaq] 6801 shr mxd, 10 6802 movq m3, [filterq+mxq*8] 6803 phaddd m0, m2 ; 0 1 6804 punpcklbw m2, m6, m3 6805 movu m3, [srcq-2] 6806 pmaddwd m2, m3 ; 2 6807 lea mxd, [tmpq+alphaq] 6808 shr tmpd, 10 6809 movq m3, [filterq+tmpq*8] 6810 punpcklbw m1, m6, m3 6811 movu m3, [srcq+0] 6812 pmaddwd m1, m3 ; 3 6813 lea tmpd, [mxq+alphaq] 6814 shr mxd, 10 6815 movq m3, [filterq+mxq*8] 6816 phaddd m2, m1 ; 2 3 6817 punpcklbw m1, m6, m3 6818 movu m3, [srcq+2] 6819 pmaddwd m1, m3 ; 4 6820 lea mxd, [tmpq+alphaq] 6821 shr tmpd, 10 6822 movq m3, [filterq+tmpq*8] 6823 phaddd m0, m2 ; 0 1 2 3 6824 punpcklbw m2, m6, m3 6825 movu m3, [srcq+4] 6826 pmaddwd m2, m3 ; 5 6827 lea tmpd, [mxq+alphaq] 6828 shr mxd, 10 6829 movq m3, [filterq+mxq*8] 6830 phaddd m1, m2 ; 4 5 
6831 punpcklbw m2, m6, m3 6832 movu m3, [srcq+6] 6833 pmaddwd m2, m3 ; 6 6834%if ARCH_X86_64 6835 lea mxd, [tmpq+betaq] ; mx += beta 6836%else 6837 mov mxd, [esp+gprsize*2+4*60] 6838 add mxd, tmpd 6839%endif 6840 shr tmpd, 10 6841 movq m3, [filterq+tmpq*8] 6842 punpcklbw m4, m6, m3 6843 movu m3, [srcq+8] 6844%if ARCH_X86_64 6845 add srcq, ssq 6846%else 6847 add srcq, [esp+gprsize*2+4*62] 6848%endif 6849 pmaddwd m3, m4 ; 7 6850 phaddd m2, m3 ; 6 7 6851 phaddd m1, m2 ; 4 5 6 7 6852 paddd m0, m7 6853 paddd m1, m7 6854 psrad m0, [rsp+gprsize*2] 6855 psrad m1, [rsp+gprsize*2] 6856 packssdw m0, m1 6857 ret 6858 6859%macro BIDIR_FN 0 6860 call .main 6861 jmp wq 6862.w4_loop: 6863 call .main 6864 lea dstq, [dstq+strideq*2] 6865.w4: 6866 movq [dstq+strideq*0], m0 6867 movhps [dstq+strideq*1], m0 6868 lea dstq, [dstq+strideq*2] 6869 movq [dstq+strideq*0], m1 6870 movhps [dstq+strideq*1], m1 6871 sub hd, 4 6872 jg .w4_loop 6873.ret: 6874 RET 6875.w8_loop: 6876 call .main 6877 lea dstq, [dstq+strideq*2] 6878.w8: 6879 mova [dstq+strideq*0], m0 6880 mova [dstq+strideq*1], m1 6881 sub hd, 2 6882 jne .w8_loop 6883 RET 6884.w16_loop: 6885 call .main 6886 add dstq, strideq 6887.w16: 6888 mova [dstq+16*0], m0 6889 mova [dstq+16*1], m1 6890 dec hd 6891 jg .w16_loop 6892 RET 6893.w32_loop: 6894 call .main 6895 add dstq, strideq 6896.w32: 6897 mova [dstq+16*0], m0 6898 mova [dstq+16*1], m1 6899 call .main 6900 mova [dstq+16*2], m0 6901 mova [dstq+16*3], m1 6902 dec hd 6903 jg .w32_loop 6904 RET 6905.w64_loop: 6906 call .main 6907 add dstq, strideq 6908.w64: 6909 mova [dstq+16*0], m0 6910 mova [dstq+16*1], m1 6911 call .main 6912 mova [dstq+16*2], m0 6913 mova [dstq+16*3], m1 6914 call .main 6915 mova [dstq+16*4], m0 6916 mova [dstq+16*5], m1 6917 call .main 6918 mova [dstq+16*6], m0 6919 mova [dstq+16*7], m1 6920 dec hd 6921 jg .w64_loop 6922 RET 6923.w128_loop: 6924 call .main 6925 add dstq, strideq 6926.w128: 6927 mova [dstq+16* 0], m0 6928 mova [dstq+16* 1], m1 6929 call .main 6930 
 ; remainder of BIDIR_FN .w128: store the seven remaining 16-pixel pairs
 mova [dstq+16* 2], m0
 mova [dstq+16* 3], m1
 call .main
 mova [dstq+16* 4], m0
 mova [dstq+16* 5], m1
 call .main
 mova [dstq+16* 6], m0
 mova [dstq+16* 7], m1
 call .main
 mova [dstq+16* 8], m0
 mova [dstq+16* 9], m1
 call .main
 mova [dstq+16*10], m0
 mova [dstq+16*11], m1
 call .main
 mova [dstq+16*12], m0
 mova [dstq+16*13], m1
 call .main
 mova [dstq+16*14], m0
 mova [dstq+16*15], m1
 dec hd
 jg .w128_loop
 RET
%endmacro

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

; Bi-directional average: dst = average of the two int16 intermediate
; buffers tmp1/tmp2, converted back to pixel range. The saturating add
; plus pmaxsw/psubsw/pmulhw sequence performs round, shift and clamp
; branchlessly; the per-bitdepth constants come from bidir_rnd/bidir_mul,
; indexed by pixel_max >> 11 (0 for 10-bit, 1 for 12-bit).
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
%define base r6-avg_ssse3_table
 LEA r6, avg_ssse3_table
 tzcnt wd, wm
 mov t0d, r6m ; pixel_max
 movsxd wq, [r6+wq*4]       ; width-dependent jump target from table
 shr t0d, 11
 movddup m2, [base+bidir_rnd+t0*8]
 movddup m3, [base+bidir_mul+t0*8]
 movifnidn hd, hm
 add wq, r6
 BIDIR_FN
ALIGN function_align
.main:
 ; produce 2x8 output pixels in m0/m1 and advance both input pointers
 mova m0, [tmp1q+16*0]
 paddsw m0, [tmp2q+16*0]    ; saturating add of the biased intermediates
 mova m1, [tmp1q+16*1]
 paddsw m1, [tmp2q+16*1]
 add tmp1q, 16*2
 add tmp2q, 16*2
 pmaxsw m0, m2              ; clamp low end of the biased range ...
 pmaxsw m1, m2
 psubsw m0, m2              ; ... then remove the bias
 psubsw m1, m2
 pmulhw m0, m3              ; scale back to pixel range
 pmulhw m1, m3
 ret

; Weighted average: dst = (tmp1*weight + tmp2*(16-weight)) with rounding,
; computed via pmaddwd on interleaved (tmp2, tmp1) word pairs against the
; packed (16-weight, weight) multiplier in m4.
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
%define base r6-w_avg_ssse3_table
 LEA r6, w_avg_ssse3_table
 tzcnt wd, wm
 mov t0d, r6m ; weight
 movd m6, r7m ; pixel_max
 movddup m5, [base+pd_65538]
 movsxd wq, [r6+wq*4]
 pshufb m6, [base+pw_256]   ; broadcast pixel_max to all words
 add wq, r6
 lea r6d, [t0-16]
 shl t0d, 16
 sub t0d, r6d ; 16-weight, weight
 paddw m5, m6
 mov r6d, t0d
 shl t0d, 2
 test dword r7m, 0x800      ; 12-bit content (pixel_max has bit 11 set)?
 cmovnz r6d, t0d            ; then use the weights scaled by 4
 movifnidn hd, hm
 movd m4, r6d
 pslld m5, 7                ; m5 = rounding/bias constant
                            ; NOTE(review): derived from pd_65538 and
                            ; pixel_max - confirm against the C w_avg()
 pxor m7, m7
 pshufd m4, m4, q0000
 BIDIR_FN
ALIGN function_align
.main:
 mova m2, [tmp1q+16*0]
 mova m0, [tmp2q+16*0]
 punpckhwd m3, m0, m2
7018 punpcklwd m0, m2 7019 mova m2, [tmp1q+16*1] 7020 mova m1, [tmp2q+16*1] 7021 add tmp1q, 16*2 7022 add tmp2q, 16*2 7023 pmaddwd m3, m4 7024 pmaddwd m0, m4 7025 paddd m3, m5 7026 paddd m0, m5 7027 psrad m3, 8 7028 psrad m0, 8 7029 packssdw m0, m3 7030 punpckhwd m3, m1, m2 7031 punpcklwd m1, m2 7032 pmaddwd m3, m4 7033 pmaddwd m1, m4 7034 paddd m3, m5 7035 paddd m1, m5 7036 psrad m3, 8 7037 psrad m1, 8 7038 packssdw m1, m3 7039 pminsw m0, m6 7040 pminsw m1, m6 7041 pmaxsw m0, m7 7042 pmaxsw m1, m7 7043 ret 7044 7045%if ARCH_X86_64 7046cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask 7047%else 7048cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask 7049%define hd dword r5m 7050%define m8 [base+pw_64] 7051%endif 7052%define base r6-mask_ssse3_table 7053 LEA r6, mask_ssse3_table 7054 tzcnt wd, wm 7055 mov t0d, r7m ; pixel_max 7056 shr t0d, 11 7057 movsxd wq, [r6+wq*4] 7058 movddup m6, [base+bidir_rnd+t0*8] 7059 movddup m7, [base+bidir_mul+t0*8] 7060%if ARCH_X86_64 7061 mova m8, [base+pw_64] 7062 movifnidn hd, hm 7063%endif 7064 add wq, r6 7065 mov maskq, r6mp 7066 BIDIR_FN 7067ALIGN function_align 7068.main: 7069 movq m3, [maskq+8*0] 7070 mova m0, [tmp1q+16*0] 7071 mova m4, [tmp2q+16*0] 7072 pxor m5, m5 7073 punpcklbw m3, m5 7074 punpckhwd m2, m0, m4 7075 punpcklwd m0, m4 7076 psubw m1, m8, m3 7077 punpckhwd m4, m3, m1 ; m, 64-m 7078 punpcklwd m3, m1 7079 pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) 7080 pmaddwd m0, m3 7081 movq m3, [maskq+8*1] 7082 mova m1, [tmp1q+16*1] 7083 mova m4, [tmp2q+16*1] 7084 add maskq, 8*2 7085 add tmp1q, 16*2 7086 add tmp2q, 16*2 7087 psrad m2, 5 7088 psrad m0, 5 7089 packssdw m0, m2 7090 punpcklbw m3, m5 7091 punpckhwd m2, m1, m4 7092 punpcklwd m1, m4 7093 psubw m5, m8, m3 7094 punpckhwd m4, m3, m5 ; m, 64-m 7095 punpcklwd m3, m5 7096 pmaddwd m2, m4 ; tmp1 * m + tmp2 * (64-m) 7097 pmaddwd m1, m3 7098 psrad m2, 5 7099 psrad m1, 5 7100 packssdw m1, m2 7101 pmaxsw m0, m6 7102 pmaxsw m1, m6 7103 psubsw m0, m6 7104 
    ; (continuation of mask_16bpc.main) finish bias removal and scale.
    psubsw m1, m6
    pmulhw m0, m7
    pmulhw m1, m7
    ret

;-----------------------------------------------------------------------
; void w_mask_420_16bpc(pixel *dst, ptrdiff_t stride,
;                       const int16_t *tmp1, const int16_t *tmp2,
;                       int w, int h, uint8_t *mask in r6mp,
;                       [int sign in r7m, int pixel_max in r8m])
; Blends tmp1/tmp2 with a mask derived from their per-pixel difference
; (see the W_MASK macro below) and writes the mask 2x2-downsampled for
; 4:2:0 chroma: each output mask byte is the sum of 4 mask words plus
; (2 - sign), >> 2. .main leaves m0/m1 = 16 blended pixels and m2/m3 =
; the corresponding 16 mask words; the width paths do the 2x2 folding,
; staging odd-row mask partial sums in the dst buffer rows that are
; overwritten afterwards.
;-----------------------------------------------------------------------
cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA     t0, w_mask_420_ssse3_table
    tzcnt   wd, wm
    mov     r6d, r8m ; pixel_max
    movd    m0, r7m ; sign
    shr     r6d, 11
    movsxd  wq, [t0+wq*4]
%if ARCH_X86_64
    mova    m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova    m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    ; x86-32: not enough XMM registers; spill the four constants to the
    ; stack and alias m8-m11 to their slots (gprsize skips the return
    ; address pushed by "call .main").
    mova    m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova    m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m4, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*4
    mova    [rsp+16*0], m1
    mova    [rsp+16*1], m2
    mova    [rsp+16*2], m3
    mova    [rsp+16*3], m4
    %define m8 [rsp+gprsize+16*0]
    %define m9 [rsp+gprsize+16*1]
    %define m10 [rsp+gprsize+16*2]
    %define m11 [rsp+gprsize+16*3]
%endif
    movd    m7, [base+pw_2]
    psubw   m7, m0              ; m7 = 2 - sign: rounding bias for the >>2
    pshufb  m7, [base+pw_256]   ; broadcast to all words
    add     wq, t0
    movifnidn hd, r5m
    mov     maskq, r6mp
    call .main
    jmp     wq
.w4_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
    add     maskq, 4
.w4:
    ; 4x4 block: fold 16 mask words down to 4 bytes via phaddw+phaddd.
    movq    [dstq+strideq*0], m0
    phaddw  m2, m3
    movhps  [dstq+strideq*1], m0
    phaddd  m2, m2
    lea     dstq, [dstq+strideq*2]
    paddw   m2, m7
    movq    [dstq+strideq*0], m1
    psrlw   m2, 2
    movhps  [dstq+strideq*1], m1
    packuswb m2, m2
    movd    [maskq], m2
    sub     hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
    add     maskq, 4
.w8:
    ; 8x2 block: m2/m3 are the two rows' mask words; sum vertically,
    ; then horizontally pair-wise.
    mova    [dstq+strideq*0], m0
    paddw   m2, m3
    phaddw  m2, m2
    mova    [dstq+strideq*1], m1
    paddw   m2, m7
    psrlw   m2, 2
    packuswb m2, m2
    movd    [maskq], m2
    sub     hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
    add     maskq, 8
.w16:
    ; Row 0: stash its mask words in the (not yet written) row-1 pixels.
    mova    [dstq+strideq*1+16*0], m2
    mova    [dstq+strideq*0+16*0], m0
    mova    [dstq+strideq*1+16*1], m3
    mova    [dstq+strideq*0+16*1], m1
    call .main
    ; Row 1: add its mask words to the stashed row-0 ones, then overwrite
    ; the staging area with the real row-1 pixels.
    paddw   m2, [dstq+strideq*1+16*0]
    paddw   m3, [dstq+strideq*1+16*1]
    mova    [dstq+strideq*1+16*0], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*1], m1
    paddw   m2, m7
    psrlw   m2, 2
    packuswb m2, m2
    movq    [maskq], m2
    sub     hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
    add     maskq, 16
.w32:
    ; Same staging scheme as .w16, extended to 32 pixels (two mask qwords).
    mova    [dstq+strideq*1+16*0], m2
    mova    [dstq+strideq*0+16*0], m0
    mova    [dstq+strideq*1+16*1], m3
    mova    [dstq+strideq*0+16*1], m1
    call .main
    mova    [dstq+strideq*0+16*2], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*3], m2
    mova    [dstq+strideq*0+16*3], m1
    call .main
    paddw   m2, [dstq+strideq*1+16*0]
    paddw   m3, [dstq+strideq*1+16*1]
    mova    [dstq+strideq*1+16*0], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*2], m2
    mova    [dstq+strideq*1+16*1], m1
    call .main
    phaddw  m2, m3
    paddw   m3, m7, [dstq+strideq*1+16*2]
    paddw   m2, [dstq+strideq*1+16*3]
    mova    [dstq+strideq*1+16*2], m0
    paddw   m2, m7
    psrlw   m3, 2
    psrlw   m2, 2
    mova    [dstq+strideq*1+16*3], m1
    packuswb m3, m2
    mova    [maskq], m3
    sub     hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
    add     maskq, 16*2
.w64:
    ; Row 0 of a 64-wide pair: stage mask words in row-1 slots 1..7.
    mova    [dstq+strideq*1+16*1], m2
    mova    [dstq+strideq*0+16*0], m0
    mova    [dstq+strideq*1+16*2], m3
    mova    [dstq+strideq*0+16*1], m1
    call .main
    mova    [dstq+strideq*1+16*3], m2
    mova    [dstq+strideq*0+16*2], m0
    mova    [dstq+strideq*1+16*4], m3
    mova    [dstq+strideq*0+16*3], m1
    call .main
    mova    [dstq+strideq*1+16*5], m2
    mova    [dstq+strideq*0+16*4], m0
    mova    [dstq+strideq*1+16*6], m3
    mova    [dstq+strideq*0+16*5], m1
    call .main
    mova    [dstq+strideq*0+16*6], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*7], m2
    mova    [dstq+strideq*0+16*7], m1
    call .main
    ; Row 1: combine with staged sums, replace staging with row-1 pixels.
    paddw   m2, [dstq+strideq*1+16*1]
    paddw   m3, [dstq+strideq*1+16*2]
    mova    [dstq+strideq*1+16*0], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*2], m2
    mova    [dstq+strideq*1+16*1], m1
    call .main
    paddw   m2, [dstq+strideq*1+16*3]
    paddw   m3, [dstq+strideq*1+16*4]
    phaddw  m2, m3
    paddw   m3, m7, [dstq+strideq*1+16*2]
    mova    [dstq+strideq*1+16*2], m0
    paddw   m2, m7
    psrlw   m3, 2
    psrlw   m2, 2
    mova    [dstq+strideq*1+16*3], m1
    packuswb m3, m2
    mova    [maskq+16*0], m3
    call .main
    paddw   m2, [dstq+strideq*1+16*5]
    paddw   m3, [dstq+strideq*1+16*6]
    mova    [dstq+strideq*1+16*4], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*6], m2
    mova    [dstq+strideq*1+16*5], m1
    call .main
    phaddw  m2, m3
    paddw   m3, m7, [dstq+strideq*1+16*6]
    paddw   m2, [dstq+strideq*1+16*7]
    mova    [dstq+strideq*1+16*6], m0
    paddw   m2, m7
    psrlw   m3, 2
    psrlw   m2, 2
    mova    [dstq+strideq*1+16*7], m1
    packuswb m3, m2
    mova    [maskq+16*1], m3
    sub     hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
    add     maskq, 16*4
.w128:
    ; 128-wide: same staging scheme unrolled across 16 row-0 stores ...
    mova    [dstq+strideq*1+16* 1], m2
    mova    [dstq+strideq*0+16* 0], m0
    mova    [dstq+strideq*1+16* 2], m3
    mova    [dstq+strideq*0+16* 1], m1
    call .main
    mova    [dstq+strideq*1+16* 3], m2
    mova    [dstq+strideq*0+16* 2], m0
    mova    [dstq+strideq*1+16* 4], m3
    mova    [dstq+strideq*0+16* 3], m1
    call .main
    mova    [dstq+strideq*1+16* 5], m2
    mova    [dstq+strideq*0+16* 4], m0
    mova    [dstq+strideq*1+16* 6], m3
    mova    [dstq+strideq*0+16* 5], m1
    call .main
    mova    [dstq+strideq*1+16* 7], m2
    mova    [dstq+strideq*0+16* 6], m0
    mova    [dstq+strideq*1+16* 8], m3
    mova    [dstq+strideq*0+16* 7], m1
    call .main
    mova    [dstq+strideq*1+16* 9], m2
    mova    [dstq+strideq*0+16* 8], m0
    mova    [dstq+strideq*1+16*10], m3
    mova    [dstq+strideq*0+16* 9], m1
    call .main
    mova    [dstq+strideq*1+16*11], m2
    mova    [dstq+strideq*0+16*10], m0
    mova    [dstq+strideq*1+16*12], m3
    mova    [dstq+strideq*0+16*11], m1
    call .main
    mova    [dstq+strideq*1+16*13], m2
    mova    [dstq+strideq*0+16*12], m0
    mova    [dstq+strideq*1+16*14], m3
    mova    [dstq+strideq*0+16*13], m1
    call .main
    mova    [dstq+strideq*0+16*14], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*15], m2
    mova    [dstq+strideq*0+16*15], m1
    call .main
    ; ... then row 1 consumes the staged sums and emits 4 mask vectors.
    paddw   m2, [dstq+strideq*1+16* 1]
    paddw   m3, [dstq+strideq*1+16* 2]
    mova    [dstq+strideq*1+16* 0], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16* 2], m2
    mova    [dstq+strideq*1+16* 1], m1
    call .main
    paddw   m2, [dstq+strideq*1+16* 3]
    paddw   m3, [dstq+strideq*1+16* 4]
    phaddw  m2, m3
    paddw   m3, m7, [dstq+strideq*1+16* 2]
    mova    [dstq+strideq*1+16* 2], m0
    paddw   m2, m7
    psrlw   m3, 2
    psrlw   m2, 2
    mova    [dstq+strideq*1+16* 3], m1
    packuswb m3, m2
    mova    [maskq+16*0], m3
    call .main
    paddw   m2, [dstq+strideq*1+16* 5]
    paddw   m3, [dstq+strideq*1+16* 6]
    mova    [dstq+strideq*1+16* 4], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16* 6], m2
    mova    [dstq+strideq*1+16* 5], m1
    call .main
    paddw   m2, [dstq+strideq*1+16* 7]
    paddw   m3, [dstq+strideq*1+16* 8]
    phaddw  m2, m3
    paddw   m3, m7, [dstq+strideq*1+16* 6]
    mova    [dstq+strideq*1+16* 6], m0
    paddw   m2, m7
    psrlw   m3, 2
    psrlw   m2, 2
    mova    [dstq+strideq*1+16* 7], m1
    packuswb m3, m2
    mova    [maskq+16*1], m3
    call .main
    paddw   m2, [dstq+strideq*1+16* 9]
    paddw   m3, [dstq+strideq*1+16*10]
    mova    [dstq+strideq*1+16* 8], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*10], m2
    mova    [dstq+strideq*1+16* 9], m1
    call .main
    paddw   m2, [dstq+strideq*1+16*11]
    paddw   m3, [dstq+strideq*1+16*12]
    phaddw  m2, m3
    paddw   m3, m7, [dstq+strideq*1+16*10]
    mova    [dstq+strideq*1+16*10], m0
    paddw   m2, m7
    psrlw   m3, 2
    psrlw   m2, 2
    mova    [dstq+strideq*1+16*11], m1
    packuswb m3, m2
    mova    [maskq+16*2], m3
    call .main
    paddw   m2, [dstq+strideq*1+16*13]
    paddw   m3, [dstq+strideq*1+16*14]
    mova    [dstq+strideq*1+16*12], m0
    phaddw  m2, m3
    mova    [dstq+strideq*1+16*14], m2
    mova    [dstq+strideq*1+16*13], m1
    call .main
    phaddw  m2, m3
    paddw   m3, m7, [dstq+strideq*1+16*14]
    paddw   m2, [dstq+strideq*1+16*15]
    mova    [dstq+strideq*1+16*14], m0
    paddw   m2, m7
    psrlw   m3, 2
    psrlw   m2, 2
    mova    [dstq+strideq*1+16*15], m1
    packuswb m3, m2
    mova    [maskq+16*3], m3
    sub     hd, 2
    jg .w128_loop
    RET
ALIGN function_align
.main:
; W_MASK: blend 8 pixels of tmp1q/tmp2q and derive their mask weights.
; The mask is computed from |tmp1 - tmp2| via the pw_27615 constant
; (see its "((64 - 38) << 10) + 1023 - 32" annotation) so that larger
; differences give weights closer to 38..64. Leaves the blended pixels
; in m%1 and the per-pixel mask words in m%2; clobbers m4-m6.
%macro W_MASK 2 ; dst/tmp_offset, mask
    mova      m%1, [tmp1q+16*%1]
    mova      m%2, [tmp2q+16*%1]
    punpcklwd m4, m%2, m%1
    punpckhwd m5, m%2, m%1
    psubsw    m%1, m%2
    pabsw     m%1, m%1
    psubusw   m6, m8, m%1
    psrlw     m6, 10 ; 64-m
    psubw     m%2, m9, m6 ; m
    punpcklwd m%1, m6, m%2  ; (64-m, m) pairs for pmaddwd
    punpckhwd m6, m%2
    pmaddwd   m%1, m4
    pmaddwd   m6, m5
    psrad     m%1, 5
    psrad     m6, 5
    packssdw  m%1, m6
    pmaxsw    m%1, m10  ; clamp, remove bias, scale (cf. avg_16bpc.main)
    psubsw    m%1, m10
    pmulhw    m%1, m11
%endmacro
    ; Produce 16 pixels (m0/m1) + 16 mask words (m2/m3) per call.
    W_MASK 0, 2
    W_MASK 1, 3
    add     tmp1q, 16*2
    add     tmp2q, 16*2
    ret

;-----------------------------------------------------------------------
; void w_mask_422_16bpc(...)
; Same blend as w_mask_420 but the mask is only downsampled 2x1
; horizontally (4:2:2); mask output is handled entirely inside .main.
;-----------------------------------------------------------------------
cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA     t0, w_mask_422_ssse3_table
    tzcnt   wd, wm
    mov     r6d, r8m ; pixel_max
    movd    m7, r7m ; sign
    shr     r6d, 11
    movsxd  wq, [t0+wq*4]
%if ARCH_X86_64
    mova    m8, [base+pw_27615]
    mova    m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    ; x86-32 register shortage: spill constants (see w_mask_420 above).
    mova    m1, [base+pw_27615]
    mova    m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m4, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*4
    mova    [rsp+16*0], m1
    mova    [rsp+16*1], m2
    ; (continuation of w_mask_422_16bpc setup: finish x86-32 spills)
    mova    [rsp+16*2], m3
    mova    [rsp+16*3], m4
%endif
    pxor    m0, m0
    add     wq, t0
    pshufb  m7, m0      ; broadcast sign byte to all lanes
    movifnidn hd, r5m
    mov     maskq, r6mp
    call .main
    jmp     wq
    ; Width paths below only store pixels; the mask bytes are produced
    ; and written by .main itself.
.w4_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
.w4:
    movq    [dstq+strideq*0], m0
    movhps  [dstq+strideq*1], m0
    lea     dstq, [dstq+strideq*2]
    movq    [dstq+strideq*0], m1
    movhps  [dstq+strideq*1], m1
    sub     hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
.w8:
    mova    [dstq+strideq*0], m0
    mova    [dstq+strideq*1], m1
    sub     hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
.w16:
    mova    [dstq+strideq*0+16*0], m0
    mova    [dstq+strideq*0+16*1], m1
    call .main
    mova    [dstq+strideq*1+16*0], m0
    mova    [dstq+strideq*1+16*1], m1
    sub     hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add     dstq, strideq
.w32:
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    call .main
    mova    [dstq+16*2], m0
    mova    [dstq+16*3], m1
    dec     hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add     dstq, strideq
.w64:
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    call .main
    mova    [dstq+16*2], m0
    mova    [dstq+16*3], m1
    call .main
    mova    [dstq+16*4], m0
    mova    [dstq+16*5], m1
    call .main
    mova    [dstq+16*6], m0
    mova    [dstq+16*7], m1
    dec     hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add     dstq, strideq
.w128:
    mova    [dstq+16* 0], m0
    mova    [dstq+16* 1], m1
    call .main
    mova    [dstq+16* 2], m0
    mova    [dstq+16* 3], m1
    call .main
    mova    [dstq+16* 4], m0
    mova    [dstq+16* 5], m1
    call .main
    mova    [dstq+16* 6], m0
    mova    [dstq+16* 7], m1
    call .main
    mova    [dstq+16* 8], m0
    mova    [dstq+16* 9], m1
    call .main
    mova    [dstq+16*10], m0
    mova    [dstq+16*11], m1
    call .main
    ; (continuation of w_mask_422_16bpc .w128 row)
    mova    [dstq+16*12], m0
    mova    [dstq+16*13], m1
    call .main
    mova    [dstq+16*14], m0
    mova    [dstq+16*15], m1
    dec     hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Blend 16 pixels, then fold the 16 mask words horizontally in
    ; pairs (phaddw), pack to bytes and round: pavgb against zero after
    ; subtracting the sign byte computes (sum + 1 - sign) >> 1.
    W_MASK 0, 2
    W_MASK 1, 3
    phaddw  m2, m3
    add     tmp1q, 16*2
    add     tmp2q, 16*2
    packuswb m2, m2
    pxor    m3, m3
    psubb   m2, m7
    pavgb   m2, m3
    movq    [maskq], m2
    add     maskq, 8
    ret

;-----------------------------------------------------------------------
; void w_mask_444_16bpc(...)
; Same blend, full-resolution (4:4:4) mask: one mask byte per pixel,
; written directly by .main with no downsampling (and no sign arg).
;-----------------------------------------------------------------------
cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
    LEA     t0, w_mask_444_ssse3_table
    tzcnt   wd, wm
    mov     r6d, r8m ; pixel_max
    shr     r6d, 11
    movsxd  wq, [t0+wq*4]
%if ARCH_X86_64
    mova    m8, [base+pw_27615]
    mova    m9, [base+pw_64]
    movddup m10, [base+bidir_rnd+r6*8]
    movddup m11, [base+bidir_mul+r6*8]
%else
    ; x86-32: only three constants are spilled; bidir_mul stays in m7.
    mova    m1, [base+pw_27615]
    mova    m2, [base+pw_64]
    movddup m3, [base+bidir_rnd+r6*8]
    movddup m7, [base+bidir_mul+r6*8]
    ALLOC_STACK -16*3
    mova    [rsp+16*0], m1
    mova    [rsp+16*1], m2
    mova    [rsp+16*2], m3
    %define m11 m7
%endif
    add     wq, t0
    movifnidn hd, r5m
    mov     maskq, r6mp
    call .main
    jmp     wq
    ; Width paths only store pixels; masks are written inside .main.
.w4_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
.w4:
    movq    [dstq+strideq*0], m0
    movhps  [dstq+strideq*1], m0
    lea     dstq, [dstq+strideq*2]
    movq    [dstq+strideq*0], m1
    movhps  [dstq+strideq*1], m1
    sub     hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
.w8:
    mova    [dstq+strideq*0], m0
    mova    [dstq+strideq*1], m1
    sub     hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea     dstq, [dstq+strideq*2]
.w16:
    mova    [dstq+strideq*0+16*0], m0
    mova    [dstq+strideq*0+16*1], m1
    call .main
    mova    [dstq+strideq*1+16*0], m0
    mova    [dstq+strideq*1+16*1], m1
    sub     hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add     dstq, strideq
.w32:
    ; (continuation of w_mask_444_16bpc .w32 row)
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    call .main
    mova    [dstq+16*2], m0
    mova    [dstq+16*3], m1
    dec     hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add     dstq, strideq
.w64:
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    call .main
    mova    [dstq+16*2], m0
    mova    [dstq+16*3], m1
    call .main
    mova    [dstq+16*4], m0
    mova    [dstq+16*5], m1
    call .main
    mova    [dstq+16*6], m0
    mova    [dstq+16*7], m1
    dec     hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add     dstq, strideq
.w128:
    mova    [dstq+16* 0], m0
    mova    [dstq+16* 1], m1
    call .main
    mova    [dstq+16* 2], m0
    mova    [dstq+16* 3], m1
    call .main
    mova    [dstq+16* 4], m0
    mova    [dstq+16* 5], m1
    call .main
    mova    [dstq+16* 6], m0
    mova    [dstq+16* 7], m1
    call .main
    mova    [dstq+16* 8], m0
    mova    [dstq+16* 9], m1
    call .main
    mova    [dstq+16*10], m0
    mova    [dstq+16*11], m1
    call .main
    mova    [dstq+16*12], m0
    mova    [dstq+16*13], m1
    call .main
    mova    [dstq+16*14], m0
    mova    [dstq+16*15], m1
    dec     hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Blend 16 pixels and store the 16 mask words packed to bytes, one
    ; mask byte per pixel (no downsampling for 4:4:4).
    W_MASK 0, 2
    W_MASK 1, 3
    packuswb m2, m3
    add     tmp1q, 16*2
    add     tmp2q, 16*2
    mova    [maskq], m2
    add     maskq, 16
    ret

; (a * (64 - m) + b * m + 32) >> 6
; = (((b - a) * m + 32) >> 6) + a
; = (((b - a) * (m << 9) + 16384) >> 15) + a
; except m << 9 overflows int16_t when m == 64 (which is possible),
; but if we negate m it works out (-64 << 9 == -32768).
; = (((a - b) * (m * -512) + 16384) >> 15) + a
;-----------------------------------------------------------------------
; void blend_16bpc(pixel *dst, ptrdiff_t stride, const int16_t *tmp,
;                  int w, int h, const uint8_t *mask)
; In-place masked blend of tmp into dst using the identity derived in
; the comment above: mask bytes are widened, multiplied by -512
; (pw_m512), and applied with pmulhrsw, so m==64 does not overflow.
;-----------------------------------------------------------------------
cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
%define base r6-blend_ssse3_table
    LEA     r6, blend_ssse3_table
    tzcnt   wd, wm
    movifnidn hd, hm
    movsxd  wq, [r6+wq*4]
    movifnidn maskq, maskmp
    mova    m7, [base+pw_m512]
    add     wq, r6
    lea     stride3q, [strideq*3]
    pxor    m6, m6                  ; zero for byte->word unpacking
    jmp     wq
.w4:
    ; 4 pixels x 4 rows per iteration.
    mova    m5, [maskq]
    movq    m0, [dstq+strideq*0]
    movhps  m0, [dstq+strideq*1]
    movq    m1, [dstq+strideq*2]
    movhps  m1, [dstq+stride3q ]
    psubw   m2, m0, [tmpq+16*0]     ; a - b
    psubw   m3, m1, [tmpq+16*1]
    add     maskq, 16
    add     tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw  m4, m7                  ; m * -512
    pmullw  m5, m7
    pmulhrsw m2, m4                 ; ((a-b)*(m*-512) + 16384) >> 15
    pmulhrsw m3, m5
    paddw   m0, m2                  ; + a
    paddw   m1, m3
    movq    [dstq+strideq*0], m0
    movhps  [dstq+strideq*1], m0
    movq    [dstq+strideq*2], m1
    movhps  [dstq+stride3q ], m1
    lea     dstq, [dstq+strideq*4]
    sub     hd, 4
    jg .w4
    RET
.w8:
    ; 8 pixels x 2 rows per iteration (same math as .w4).
    mova    m5, [maskq]
    mova    m0, [dstq+strideq*0]
    mova    m1, [dstq+strideq*1]
    psubw   m2, m0, [tmpq+16*0]
    psubw   m3, m1, [tmpq+16*1]
    add     maskq, 16
    add     tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw  m4, m7
    pmullw  m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+strideq*0], m0
    mova    [dstq+strideq*1], m1
    lea     dstq, [dstq+strideq*2]
    sub     hd, 2
    jg .w8
    RET
.w16:
    ; 16 pixels x 1 row per iteration.
    mova    m5, [maskq]
    mova    m0, [dstq+16*0]
    mova    m1, [dstq+16*1]
    psubw   m2, m0, [tmpq+16*0]
    psubw   m3, m1, [tmpq+16*1]
    add     maskq, 16
    add     tmpq, 32
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw  m4, m7
    pmullw  m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    add     dstq, strideq
    dec     hd
    jg .w16
    RET
.w32:
    ; 32 pixels x 1 row: two 16-pixel halves back to back.
    mova    m5, [maskq+16*0]
    mova    m0, [dstq+16*0]
    mova    m1, [dstq+16*1]
    psubw   m2, m0, [tmpq+16*0]
    psubw   m3, m1, [tmpq+16*1]
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw  m4, m7
    pmullw  m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    mova    m5, [maskq+16*1]
    mova    m0, [dstq+16*2]
    mova    m1, [dstq+16*3]
    psubw   m2, m0, [tmpq+16*2]
    psubw   m3, m1, [tmpq+16*3]
    add     maskq, 32
    add     tmpq, 64
    punpcklbw m4, m5, m6
    punpckhbw m5, m6
    pmullw  m4, m7
    pmullw  m5, m7
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+16*2], m0
    mova    [dstq+16*3], m1
    add     dstq, strideq
    dec     hd
    jg .w32
    RET

;-----------------------------------------------------------------------
; void blend_v_16bpc(pixel *dst, ptrdiff_t stride, const int16_t *tmp,
;                    int w, int h)
; OBMC vertical-edge blend: the mask depends only on the x position, so
; a fixed (pre-shifted, see obmc_masks) weight vector is loaded once
; per width and applied with pmulhrsw on every row.
;-----------------------------------------------------------------------
cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
%define base r5-blend_v_ssse3_table
    LEA     r5, blend_v_ssse3_table
    tzcnt   wd, wm
    movifnidn hd, hm
    movsxd  wq, [r5+wq*4]
    add     wq, r5
    jmp     wq
.w2:
    movd    m4, [base+obmc_masks+2*2]
.w2_loop:
    movd    m0, [dstq+strideq*0]
    movd    m2, [tmpq+4*0]
    movd    m1, [dstq+strideq*1]
    movd    m3, [tmpq+4*1]
    add     tmpq, 4*2
    psubw   m2, m0                  ; tmp - dst
    psubw   m3, m1
    pmulhrsw m2, m4                 ; scale by obmc weight (<<9 in table)
    pmulhrsw m3, m4
    paddw   m0, m2
    paddw   m1, m3
    movd    [dstq+strideq*0], m0
    movd    [dstq+strideq*1], m1
    lea     dstq, [dstq+strideq*2]
    sub     hd, 2
    jg .w2_loop
    RET
.w4:
    movddup m2, [base+obmc_masks+4*2]
.w4_loop:
    movq    m0, [dstq+strideq*0]
    movhps  m0, [dstq+strideq*1]
    mova    m1, [tmpq]
    add     tmpq, 8*2
    psubw   m1, m0
    pmulhrsw m1, m2
    paddw   m0, m1
    movq    [dstq+strideq*0], m0
    movhps  [dstq+strideq*1], m0
    lea     dstq, [dstq+strideq*2]
    sub     hd, 2
    jg .w4_loop
    RET
.w8:
    mova    m4, [base+obmc_masks+8*2]
.w8_loop:
    mova    m0, [dstq+strideq*0]
    mova    m2, [tmpq+16*0]
    mova    m1, [dstq+strideq*1]
    mova    m3, [tmpq+16*1]
    add     tmpq, 16*2
    ; (continuation of blend_v_16bpc .w8_loop)
    psubw   m2, m0
    psubw   m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m4
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+strideq*0], m0
    mova    [dstq+strideq*1], m1
    lea     dstq, [dstq+strideq*2]
    sub     hd, 2
    jg .w8_loop
    RET
.w16:
    ; Only 12 of 16 columns are blended; the second weight vector is
    ; loaded as a quadword (upper words zero => those columns unchanged
    ; by pmulhrsw+paddw).
    mova    m4, [base+obmc_masks+16*2]
    movq    m5, [base+obmc_masks+16*3]
.w16_loop:
    mova    m0, [dstq+16*0]
    mova    m2, [tmpq+16*0]
    mova    m1, [dstq+16*1]
    mova    m3, [tmpq+16*1]
    add     tmpq, 16*2
    psubw   m2, m0
    psubw   m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    add     dstq, strideq
    dec     hd
    jg .w16_loop
    RET
.w32:
%if WIN64
    movaps  [rsp+8], m6             ; xmm6 is callee-saved on Win64
%endif
    mova    m4, [base+obmc_masks+16*4]
    mova    m5, [base+obmc_masks+16*5]
    mova    m6, [base+obmc_masks+16*6]
.w32_loop:
    mova    m0, [dstq+16*0]
    mova    m2, [tmpq+16*0]
    mova    m1, [dstq+16*1]
    mova    m3, [tmpq+16*1]
    psubw   m2, m0
    psubw   m3, m1
    pmulhrsw m2, m4
    pmulhrsw m3, m5
    paddw   m0, m2
    mova    m2, [dstq+16*2]
    paddw   m1, m3
    mova    m3, [tmpq+16*2]
    add     tmpq, 16*4
    psubw   m3, m2
    pmulhrsw m3, m6
    paddw   m2, m3
    mova    [dstq+16*0], m0
    mova    [dstq+16*1], m1
    mova    [dstq+16*2], m2
    add     dstq, strideq
    dec     hd
    jg .w32_loop
%if WIN64
    movaps  m6, [rsp+8]
%endif
    RET

; Blend one 16-pixel chunk of a row for blend_h: dst/tmp offsets are in
; 16-byte units; the per-row weight is expected broadcast in m5.
; %3 advances tmpq after the loads when non-zero.
%macro BLEND_H_ROW 2-3 0; dst_off, tmp_off, inc_tmp
    mova    m0, [dstq+16*(%1+0)]
    mova    m2, [tmpq+16*(%2+0)]
    mova    m1, [dstq+16*(%1+1)]
    mova    m3, [tmpq+16*(%2+1)]
%if %3
    add     tmpq, 16*%3
%endif
    psubw   m2, m0
    psubw   m3, m1
    pmulhrsw m2, m5
    pmulhrsw m3, m5
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+16*(%1+0)], m0
    mova    [dstq+16*(%1+1)], m1
%endmacro

;-----------------------------------------------------------------------
; void blend_h_16bpc(pixel *dst, ptrdiff_t ds, const int16_t *tmp,
;                    int w, int h)
; OBMC horizontal-edge blend: the weight depends only on the row, read
; from obmc_masks per row. Only the top 3/4 of the block is blended;
; hq counts rows negatively up to 0.
;-----------------------------------------------------------------------
cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base r6-blend_h_ssse3_table
    LEA     r6, blend_h_ssse3_table
    tzcnt   wd, wm
    mov     hd, hm
    movsxd  wq, [r6+wq*4]
    movddup m4, [base+blend_shuf]   ; word-broadcast shuffle for row weight
    lea     maskq, [base+obmc_masks+hq*2]
    lea     hd, [hq*3]
    add     wq, r6
    shr     hd, 2 ; h * 3/4
    lea     maskq, [maskq+hq*2]     ; point at end of the rows to blend
    neg     hq                      ; negative row counter -> 0
    jmp     wq
.w2:
    movd    m0, [dstq+dsq*0]
    movd    m2, [dstq+dsq*1]
    movd    m3, [maskq+hq*2]        ; weights for the two rows
    movq    m1, [tmpq]
    add     tmpq, 4*2
    punpckldq m0, m2
    punpcklwd m3, m3                ; duplicate each row weight per pixel
    psubw   m1, m0
    pmulhrsw m1, m3
    paddw   m0, m1
    movd    [dstq+dsq*0], m0
    psrlq   m0, 32
    movd    [dstq+dsq*1], m0
    lea     dstq, [dstq+dsq*2]
    add     hq, 2
    jl .w2
    RET
.w4:
    mova    m3, [base+blend_shuf]
.w4_loop:
    movq    m0, [dstq+dsq*0]
    movhps  m0, [dstq+dsq*1]
    movd    m2, [maskq+hq*2]
    mova    m1, [tmpq]
    add     tmpq, 8*2
    psubw   m1, m0
    pshufb  m2, m3                  ; broadcast row weights into halves
    pmulhrsw m1, m2
    paddw   m0, m1
    movq    [dstq+dsq*0], m0
    movhps  [dstq+dsq*1], m0
    lea     dstq, [dstq+dsq*2]
    add     hq, 2
    jl .w4_loop
    RET
.w8:
    movddup m5, [base+blend_shuf+8] ; shuffle for the second row's weight
%if WIN64
    movaps  [rsp+ 8], m6            ; xmm6/xmm7 are callee-saved on Win64
    movaps  [rsp+24], m7
%endif
.w8_loop:
    movd    m7, [maskq+hq*2]
    mova    m0, [dstq+dsq*0]
    mova    m2, [tmpq+16*0]
    mova    m1, [dstq+dsq*1]
    mova    m3, [tmpq+16*1]
    add     tmpq, 16*2
    pshufb  m6, m7, m4              ; row 0 weight broadcast
    psubw   m2, m0
    pshufb  m7, m5                  ; row 1 weight broadcast
    psubw   m3, m1
    pmulhrsw m2, m6
    pmulhrsw m3, m7
    paddw   m0, m2
    paddw   m1, m3
    mova    [dstq+dsq*0], m0
    mova    [dstq+dsq*1], m1
    lea     dstq, [dstq+dsq*2]
    add     hq, 2
    jl .w8_loop
%if WIN64
    movaps  m6, [rsp+ 8]
    movaps  m7, [rsp+24]
%endif
    RET
.w16:
    ; One row per iteration from here on: broadcast the row weight into
    ; m5 and process the row in 16-pixel chunks via BLEND_H_ROW.
    movd    m5, [maskq+hq*2]
    pshufb  m5, m4
    BLEND_H_ROW 0, 0, 2
    add     dstq, dsq
    inc     hq
    jl .w16
    RET
.w32:
    movd    m5, [maskq+hq*2]
    pshufb  m5, m4
    BLEND_H_ROW 0, 0
    BLEND_H_ROW 2, 2, 4
    add     dstq, dsq
    inc     hq
    jl .w32
    RET
.w64:
    movd    m5, [maskq+hq*2]
    pshufb  m5, m4
    BLEND_H_ROW 0, 0
    BLEND_H_ROW 2, 2
    ; (continuation of blend_h_16bpc .w64)
    BLEND_H_ROW 4, 4
    BLEND_H_ROW 6, 6, 8
    add     dstq, dsq
    inc     hq
    jl .w64
    RET
.w128:
    movd    m5, [maskq+hq*2]
    pshufb  m5, m4
    BLEND_H_ROW 0, 0
    BLEND_H_ROW 2, 2
    BLEND_H_ROW 4, 4
    BLEND_H_ROW 6, 6, 16
    ; tmpq was advanced by 16*16 above, so the remaining chunks of this
    ; row are addressed with negative tmp offsets.
    BLEND_H_ROW 8, -8
    BLEND_H_ROW 10, -6
    BLEND_H_ROW 12, -4
    BLEND_H_ROW 14, -2
    add     dstq, dsq
    inc     hq
    jl .w128
    RET

; emu_edge args:
; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
; const pixel *ref, const ptrdiff_t ref_stride
;
; bw, bh total filled size
; iw, ih, copied block -> fill bottom, right
; x, y, offset in bw/bh -> fill top, left
;
; Copies the in-bounds part of the reference block and replicates its
; edges to fill the out-of-bounds area. The x86-32 path constantly
; reloads/saves values through stack args because of register shortage.
cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
                                   y, dst, dstride, src, sstride, \
                                   bottomext, rightext, blk
    ; we assume that the buffer (stride) is larger than width, so we can
    ; safely overwrite by a few bytes

%if ARCH_X86_64
    %define reg_zero       r12q
    %define reg_tmp        r10
    %define reg_src        srcq
    %define reg_bottomext  bottomextq
    %define reg_rightext   rightextq
    %define reg_blkm       r9m
%else
    %define reg_zero       r6
    %define reg_tmp        r0
    %define reg_src        r1
    %define reg_bottomext  r0
    %define reg_rightext   r1
    %define reg_blkm       r2m
%endif
    ;
    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor             reg_zero, reg_zero
    lea             reg_tmp, [ihq-1]
    cmp             yq, ihq
    cmovs           reg_tmp, yq        ; min(y, ih-1)
    test            yq, yq
    cmovs           reg_tmp, reg_zero  ; max(result, 0)
%if ARCH_X86_64
    imul            reg_tmp, sstrideq
    add             srcq, reg_tmp
%else
    imul            reg_tmp, sstridem
    mov             reg_src, srcm
    add             reg_src, reg_tmp
%endif
    ;
    ; ref += iclip(x, 0, iw - 1)
    lea             reg_tmp, [iwq-1]
    cmp             xq, iwq
    cmovs           reg_tmp, xq
    test            xq, xq
    cmovs           reg_tmp, reg_zero
    lea             reg_src, [reg_src+reg_tmp*2]  ; *2: 16-bit pixels
%if ARCH_X86_32
    mov             srcm, reg_src
%endif
    ;
    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
%if ARCH_X86_32
    mov             r1, r1m ; restore bh
%endif
    lea             reg_bottomext, [yq+bhq]
    sub             reg_bottomext, ihq
    lea             r3, [bhq-1]
    cmovs           reg_bottomext, reg_zero
    ;

    DEFINE_ARGS bw, bh, iw, ih, x, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, reg_zero
    cmp             reg_bottomext, bhq
    cmovns          reg_bottomext, r3   ; clamp to bh-1
    cmp             topextq, bhq
    cmovg           topextq, r3
 %if ARCH_X86_32
    mov             r4m, reg_bottomext
    ;
    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    mov             r0, r0m ; restore bw
 %endif
    lea             reg_rightext, [xq+bwq]
    sub             reg_rightext, iwq
    lea             r2, [bwq-1]
    cmovs           reg_rightext, reg_zero

    DEFINE_ARGS bw, bh, iw, ih, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; left_ext = iclip(-x, 0, bw - 1)
    neg             leftextq
    cmovs           leftextq, reg_zero
    cmp             reg_rightext, bwq
    cmovns          reg_rightext, r2
 %if ARCH_X86_32
    mov             r3m, r1
 %endif
    cmp             leftextq, bwq
    cmovns          leftextq, r2

%undef reg_zero
%undef reg_tmp
%undef reg_src
%undef reg_bottomext
%undef reg_rightext

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; center_h = bh - top_ext - bottom_ext
%if ARCH_X86_64
    lea             r3, [bottomextq+topextq]
    sub             centerhq, r3
%else
    mov             r1, centerhm ; restore r1
    sub             centerhq, topextq
    sub             centerhq, r4m
    mov             r1m, centerhq
%endif
    ;
    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov             r2, topextq
%if ARCH_X86_64
    imul            r2, dstrideq
%else
    mov             r6, r6m ; restore dstq
    imul            r2, dstridem
%endif
    add             dstq, r2
    mov             reg_blkm, dstq ; save pointer for ext
    ;
    ; center_w = bw - left_ext - right_ext
    mov             centerwq, bwq
%if ARCH_X86_64
    lea             r3, [rightextq+leftextq]
    sub             centerwq, r3
%else
    sub             centerwq, r3m
    sub             centerwq, leftextq
%endif

; vloop Macro
; Copies the center rows, replicating the leftmost/rightmost source
; pixel into the left/right extension areas as requested by %1/%2.
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
 %if ARCH_X86_64
    %define reg_tmp        r12
 %else
    %define reg_tmp        r0
 %endif
.v_loop_%3:
 %if ARCH_X86_32
    mov             r0, r0m
    mov             r1, r1m
 %endif
%if %1
    ; left extension: broadcast the first source pixel across a vector.
 %if ARCH_X86_64
    movd            m0, [srcq]
 %else
    mov             r3, srcm
    movd            m0, [r3]
 %endif
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0
    xor             r3, r3
.left_loop_%3:
    mova            [dstq+r3*2], m0
    add             r3, mmsize/2
    cmp             r3, leftextq
    jl .left_loop_%3
    ; body
    lea             reg_tmp, [dstq+leftextq*2]
%endif
    xor             r3, r3
.body_loop_%3:
 %if ARCH_X86_64
    movu            m0, [srcq+r3*2]
 %else
    mov             r1, srcm
    movu            m0, [r1+r3*2]
 %endif
%if %1
    movu            [reg_tmp+r3*2], m0
%else
    movu            [dstq+r3*2], m0
%endif
    add             r3, mmsize/2
    cmp             r3, centerwq
    jl .body_loop_%3
%if %2
    ; right extension: broadcast the last source pixel.
%if %1
    lea             reg_tmp, [reg_tmp+centerwq*2]
%else
    lea             reg_tmp, [dstq+centerwq*2]
%endif
 %if ARCH_X86_64
    movd            m0, [srcq+centerwq*2-2]
 %else
    mov             r3, srcm
    movd            m0, [r3+centerwq*2-2]
 %endif
    pshuflw         m0, m0, q0000
    punpcklqdq      m0, m0
    xor             r3, r3
.right_loop_%3:
    movu            [reg_tmp+r3*2], m0
    add             r3, mmsize/2
 %if ARCH_X86_64
    cmp             r3, rightextq
 %else
    cmp             r3, r3m
 %endif
    jl .right_loop_%3
%endif
 %if ARCH_X86_64
    add             dstq, dstrideq
    add             srcq, sstrideq
    dec             centerhq
    jg .v_loop_%3
 %else
    add             dstq, dstridem
    mov             r0, sstridem
    add             srcm, r0
    sub             dword centerhm, 1
    jg .v_loop_%3
    mov             r0, r0m ; restore r0
 %endif
%endmacro ; vloop MACRO

    ; Dispatch to one of four v_loop instantiations depending on which
    ; horizontal extensions are needed.
    test            leftextq, leftextq
    jnz .need_left_ext
 %if ARCH_X86_64
    test            rightextq, rightextq
    jnz .need_right_ext
 %else
    cmp             leftextq, r3m ; leftextq == 0
    jne .need_right_ext
 %endif
    v_loop          0, 0, 0
    jmp .body_done

    ;left right extensions
.need_left_ext:
 %if ARCH_X86_64
    test            rightextq, rightextq
 %else
    mov             r3, r3m
    test            r3, r3
 %endif
    jnz .need_left_right_ext
    v_loop          1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop          1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop          0, 1, 3

.body_done:
; r0 ; bw
; r1 ;; x loop
; r4 ;; y loop
; r5 ; topextq
; r6 ;dstq
; r7 ;dstrideq
; r8 ; srcq
%if ARCH_X86_64
    %define reg_dstride    dstrideq
%else
    %define reg_dstride    r2
%endif
    ;
    ; bottom edge extension: replicate the last written row downwards.
 %if ARCH_X86_64
    test            bottomextq, bottomextq
    jz .top
 %else
    xor             r1, r1
    cmp             r1, r4m
    je .top
 %endif
    ;
 %if ARCH_X86_64
    mov             srcq, dstq
    sub             srcq, dstrideq
    xor             r1, r1
 %else
    mov             r3, dstq
    mov             reg_dstride, dstridem
    sub             r3, reg_dstride
    mov             srcm, r3
 %endif
    ;
.bottom_x_loop:
 %if ARCH_X86_64
    mova            m0, [srcq+r1*2]
    lea             r3, [dstq+r1*2]
    mov             r4, bottomextq
 %else
    mov             r3, srcm
    mova            m0, [r3+r1*2]
    lea             r3, [dstq+r1*2]
    mov             r4, r4m
 %endif
    ;
.bottom_y_loop:
    mova            [r3], m0
    add             r3, reg_dstride
    dec             r4
    jg .bottom_y_loop
    add             r1, mmsize/2
    cmp             r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension: replicate the first written row upwards
    ; (reg_blkm was saved as the first real row's pointer).
    test            topextq, topextq
    jz .end
%if ARCH_X86_64
    mov             srcq, reg_blkm
%else
    mov             r3, reg_blkm
    mov             reg_dstride, dstridem
%endif
    mov             dstq, dstm
    xor             r1, r1
    ;
.top_x_loop:
%if ARCH_X86_64
    mova            m0, [srcq+r1*2]
%else
    mov             r3, reg_blkm
    mova            m0, [r3+r1*2]
%endif
    lea             r3, [dstq+r1*2]
    mov             r4, topextq
    ;
.top_y_loop:
    mova            [r3], m0
    add             r3, reg_dstride
    dec             r4
    jg .top_y_loop
    add             r1, mmsize/2
    cmp             r1, bwq
    jl .top_x_loop

.end:
    RET

%undef reg_dstride
%undef reg_blkm
%undef reg_tmp

; SCRATCH: on x86-32, spill m%1 to stack slot %3 and alias m%2 to it;
; on x86-64, just rename the register (SWAP) since enough XMM regs exist.
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP             %1, %2
%endif
%endmacro

; resize_16bpc: horizontal scaler. NOTE(review): the function continues
; beyond this chunk; only its setup is visible/documented here.
%if ARCH_X86_64
cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
                                       dst_w, h, src_w, dx, mx0, pxmax
%elif STACK_ALIGNMENT >= 16
cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
                                     dst_w, h, src_w, dx, mx0, pxmax
%else
cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
                                     dst_w, h, src_w, dx, mx0, pxmax
%endif
    movifnidn       dstq, dstmp
    movifnidn       srcq, srcmp
%if STACK_ALIGNMENT >= 16
    movifnidn       dst_wd, dst_wm
%endif
%if ARCH_X86_64
    movifnidn       hd, hm
%endif
    sub             dword mx0m, 4<<14
    sub             dword src_wm, 8
    movd            m4, pxmaxm
    movd            m7, dxm
    movd            m6, mx0m
    movd            m5, src_wm
    punpcklwd       m4, m4
    pshufd          m4, m4, q0000
    pshufd          m7, m7, q0000
    pshufd          m6, m6, q0000
    pshufd          m5, m5, q0000
    mova [rsp+16*3*ARCH_X86_32], m4
%if ARCH_X86_64
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA             r7, $$
    %define base r7-$$
%else
    DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
    %define hd dword r5m
 %if STACK_ALIGNMENT >= 16
    LEA             r6, $$
    %define base r6-$$
 %else
    LEA             r4, $$
    %define base r4-$$
 %endif
%endif
%if ARCH_X86_64
    mova            m12, [base+pd_64]
    mova            m11, [base+pd_63]
%else
    %define m12 [base+pd_64]
    %define m11 [base+pd_63]
%endif
    pmaddwd         m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
    pslld           m7, 2                      ; dx*4
    pslld           m5, 14
    paddd           m6, m4                     ; mx+[0..3]*dx
    SCRATCH          7, 15, 0
    SCRATCH          6, 14, 1
SCRATCH 5, 13, 2 8544 pxor m1, m1 8545.loop_y: 8546 xor xd, xd 8547 mova m0, m14 ; per-line working version of mx 8548.loop_x: 8549 pcmpgtd m1, m0 8550 pandn m1, m0 8551 psrad m2, m0, 8 ; filter offset (unmasked) 8552 pcmpgtd m3, m13, m1 8553 pand m1, m3 8554 pandn m3, m13 8555 por m1, m3 8556 psubd m3, m0, m1 ; pshufb offset 8557 psrad m1, 14 ; clipped src_x offset 8558 psrad m3, 14 ; pshufb edge_emu offset 8559 pand m2, m11 ; filter offset (masked) 8560 ; load source pixels 8561%if ARCH_X86_64 8562 movd r8d, m1 8563 pshuflw m1, m1, q3232 8564 movd r9d, m1 8565 punpckhqdq m1, m1 8566 movd r10d, m1 8567 psrlq m1, 32 8568 movd r11d, m1 8569 movu m4, [srcq+r8*2] 8570 movu m5, [srcq+r9*2] 8571 movu m6, [srcq+r10*2] 8572 movu m7, [srcq+r11*2] 8573 ; if no emulation is required, we don't need to shuffle or emulate edges 8574 packssdw m3, m3 8575 movq r11, m3 8576 test r11, r11 8577 jz .filter 8578 movsx r8, r11w 8579 sar r11, 16 8580 movsx r9, r11w 8581 sar r11, 16 8582 movsx r10, r11w 8583 sar r11, 16 8584 movu m1, [base+resize_shuf+8+r8*2] 8585 movu m3, [base+resize_shuf+8+r9*2] 8586 movu m8, [base+resize_shuf+8+r10*2] 8587 movu m9, [base+resize_shuf+8+r11*2] 8588 pshufb m4, m1 8589 pshufb m5, m3 8590 pshufb m6, m8 8591 pshufb m7, m9 8592.filter: 8593 movd r8d, m2 8594 pshuflw m2, m2, q3232 8595 movd r9d, m2 8596 punpckhqdq m2, m2 8597 movd r10d, m2 8598 psrlq m2, 32 8599 movd r11d, m2 8600 movq m8, [base+resize_filter+r8*8] 8601 movq m2, [base+resize_filter+r9*8] 8602 pxor m9, m9 8603 punpcklbw m1, m9, m8 8604 punpcklbw m3, m9, m2 8605 psraw m1, 8 8606 psraw m3, 8 8607 movq m10, [base+resize_filter+r10*8] 8608 movq m2, [base+resize_filter+r11*8] 8609 punpcklbw m8, m9, m10 8610 punpcklbw m9, m2 8611 psraw m8, 8 8612 psraw m9, 8 8613 pmaddwd m4, m1 8614 pmaddwd m5, m3 8615 pmaddwd m6, m8 8616 pmaddwd m7, m9 8617 phaddd m4, m5 8618%else 8619 movd r3, m1 8620 pshuflw m1, m1, q3232 8621 movd r1, m1 8622 punpckhqdq m1, m1 8623 movu m4, [srcq+r3*2] 8624 movu m5, [srcq+r1*2] 
8625 movd r3, m1 8626 psrlq m1, 32 8627 movd r1, m1 8628 movu m6, [srcq+r3*2] 8629 movu m7, [srcq+r1*2] 8630 ; if no emulation is required, we don't need to shuffle or emulate edges 8631 pxor m1, m1 8632 pcmpeqb m1, m3 8633 pmovmskb r3d, m1 8634 cmp r3d, 0xffff 8635 je .filter 8636 movd r3, m3 8637 movu m1, [base+resize_shuf+8+r3*2] 8638 pshuflw m3, m3, q3232 8639 movd r1, m3 8640 pshufb m4, m1 8641 movu m1, [base+resize_shuf+8+r1*2] 8642 punpckhqdq m3, m3 8643 movd r3, m3 8644 pshufb m5, m1 8645 movu m1, [base+resize_shuf+8+r3*2] 8646 psrlq m3, 32 8647 movd r1, m3 8648 pshufb m6, m1 8649 movu m1, [base+resize_shuf+8+r1*2] 8650 pshufb m7, m1 8651.filter: 8652 mova [esp+4*16], m6 8653 mova [esp+5*16], m7 8654 movd r3, m2 8655 pshuflw m2, m2, q3232 8656 movd r1, m2 8657 movq m6, [base+resize_filter+r3*8] 8658 movq m7, [base+resize_filter+r1*8] 8659 pxor m3, m3 8660 punpcklbw m1, m3, m6 8661 punpcklbw m3, m7 8662 psraw m1, 8 8663 psraw m3, 8 8664 pmaddwd m4, m1 8665 pmaddwd m5, m3 8666 punpckhqdq m2, m2 8667 movd r3, m2 8668 psrlq m2, 32 8669 movd r1, m2 8670 phaddd m4, m5 8671 movq m2, [base+resize_filter+r3*8] 8672 movq m5, [base+resize_filter+r1*8] 8673 mova m6, [esp+4*16] 8674 mova m7, [esp+5*16] 8675 pxor m3, m3 8676 punpcklbw m1, m3, m2 8677 punpcklbw m3, m5 8678 psraw m1, 8 8679 psraw m3, 8 8680 pmaddwd m6, m1 8681 pmaddwd m7, m3 8682%endif 8683 phaddd m6, m7 8684 phaddd m4, m6 8685 pxor m1, m1 8686 psubd m2, m12, m4 8687 psrad m2, 7 8688 packssdw m2, m2 8689 pmaxsw m2, m1 8690 pminsw m2, [rsp+16*3*ARCH_X86_32] 8691 movq [dstq+xq*2], m2 8692 paddd m0, m15 8693 add xd, 4 8694%if STACK_ALIGNMENT >= 16 8695 cmp xd, dst_wd 8696%else 8697 cmp xd, dst_wm 8698%endif 8699 jl .loop_x 8700 add dstq, dst_stridemp 8701 add srcq, src_stridemp 8702 dec hd 8703 jg .loop_y 8704 RET 8705