; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro

%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
    %rep %1
        db %2*3
        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
           mangle(private_prefix %+ _save_tmvs_%3).write1
    %endrep
%endmacro

%if ARCH_X86_64
mv_proj:       dw 0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048, 1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024,  963,  910,  862,  819,  780,  744,  712
               dw  682,  655,  630,  606,  585,  564,  546,  528
splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
%endif
save_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
save_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
cond_shuf512:  db  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7,  3,  3,  3,  3
save_cond0:    db 0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1:    db 0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:        times 16 db 128
pq_8192:       dq 8192

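; The save_tmvs tables below are indexed by block size; each entry holds the
; candidate advance for that block width (w*3, matching the x/5*3 scaling used
; in .loop_x) and the offset of the matching .write<w> routine relative to
; .write1. Temporal blocks are 5 bytes each (4-byte mv + 1-byte ref), hence
; the *5 scaling of strides and x offsets throughout this file.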
save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4,  8, ssse3
                       SAVE_TMVS_TABLE 4,  4, ssse3
                       SAVE_TMVS_TABLE 5,  2, ssse3
                       SAVE_TMVS_TABLE 7,  1, ssse3

%if ARCH_X86_64
save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
                      SAVE_TMVS_TABLE 4,  8, avx2
                      SAVE_TMVS_TABLE 4,  4, avx2
                      SAVE_TMVS_TABLE 5,  2, avx2
                      SAVE_TMVS_TABLE 7,  1, avx2

save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
                           SAVE_TMVS_TABLE 4,  8, avx512icl
                           SAVE_TMVS_TABLE 4,  4, avx512icl
                           SAVE_TMVS_TABLE 5,  2, avx512icl
                           SAVE_TMVS_TABLE 7,  1, avx512icl

JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
%endif

JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

SECTION .text

%macro movif32 2
%if ARCH_X86_32
    mov %1, %2
%endif
%endmacro

INIT_XMM ssse3
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
%if ARCH_X86_64
cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base_reg r12
%else
cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
                            xend, yend, xstart, ystart
    movq        m5, [ref_signq]
    lea         strided, [strided*5]
    mov         stridem, strided
    mov         r3, xstartm
    mov         r1, ystartm
    DEFINE_ARGS b, ystart, rr, cand, xend, x
%define stridemp r1m
%define m8 [base+pb_128]
%define m9 [base+save_pack0+ 0]
%define m10 [base+save_pack0+16]
%define base_reg r6
%endif
%define base base_reg-.write1
    LEA         base_reg, .write1
%if ARCH_X86_64
    movifnidn   xendd, xendm
    movifnidn   yendd, yendm
    mov         xstartd, xstartm
    mov         ystartd, ystartm
    movq        m5, [ref_signq]
%endif
    movu        m4, [base+save_ref_shuf]
    movddup     m6, [base+save_cond0]
    movddup     m7, [base+save_cond1]
%if ARCH_X86_64
    mova        m8, [base+pb_128]
    mova        m9, [base+save_pack0+ 0]
    mova        m10, [base+save_pack0+16]
%endif
    psllq       m5, 8
%if ARCH_X86_64
    lea         r9d, [xendq*5]
    lea         xstartd, [xstartq*5]
    sub         yendd, ystartd
    add         ystartd, ystartd
    lea         strideq, [strideq*5]
    sub         xstartq, r9
    add         xendd, r9d
    add         rpq, r9
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
%else
    lea         r0, [xendd*5]  ; xend5
    lea         r3, [r3*5]     ; xstart5
    sub         r3, r0         ; -w5
    mov         r6m, r3
%define xstartq r6m
    add         xendd, r0      ; xend6
    add         r0m, r0        ; rp+xend5
    mov         xendm, xendd
    sub         r5, r1         ; h
    add         r1, r1
    mov         r7m, r1
    mov         r5m, r5
%define hd r5mp
    jmp         .loop_y_noload
%endif
.loop_y:
    movif32     ystartd, r7m
    movif32     xendd, xendm
.loop_y_noload:
    and         ystartd, 30
    mov         xq, xstartq
    mov         bq, [rrq+ystartq*gprsize]
    add         ystartd, 2
    movif32     r7m, ystartd
    lea         bq, [bq+xendq*4]
.loop_x:
%if ARCH_X86_32
%define rpq  r3
%define r10  r1
%define r10d r1
%define r11  r4
%define r11d r4
%endif
    imul        candq, xq, 0x9999 ; x / 5 * 3
    sar         candq, 16
    movzx       r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu        m0, [bq+candq*8+12]        ; cand_b
    movzx       r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
    movzx       r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
    add         r10, base_reg
    add         candq, r11
    jge         .calc
    movu        m1, [bq+candq*8+12]
    movzx       r11d, byte [bq+candq*8+22]
    movzx       r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
    add         r11, base_reg
.calc:
    movif32     rpq, r0m
    ; ref check
    punpckhqdq  m2, m0, m1
    pshufb      m2, m4            ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
    pshufb      m3, m5, m2        ; ref > 0 && ref_sign[ref - 1]
    ; mv check
    punpcklqdq  m2, m0, m1        ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
    pabsw       m2, m2
    psrlw       m2, 12            ; (abs(mv.x) | abs(mv.y)) < 4096
    ; res
    pcmpgtd     m3, m2
    pshufd      m2, m3, q2301
    pand        m3, m6            ; b0c0 b0c1 b1c0 b1c1 | ...
    pand        m2, m7            ; b0c1 b0c0 b1c1 b1c0 | ...
    por         m3, m2            ; b0.shuf b1.shuf | ...
    pxor        m3, m8            ; if cond0|cond1 == 0 => zero out
    pshufb      m0, m3
    pshufb      m1, m3
    call        r10
    jge         .next_line
    pshufd      m0, m1, q3232
    call        r11
    jl          .loop_x
.next_line:
    add         rpq, stridemp
    movif32     r0m, rpq
    dec         hd
    jg          .loop_y
    RET
.write1:
    movd        [rpq+xq+0], m0
    psrlq       m0, 8
    movd        [rpq+xq+1], m0
    add         xq, 5*1
    ret
.write2:
    movq        [rpq+xq+0], m0
    psrlq       m0, 8
    movd        [rpq+xq+6], m0
    add         xq, 5*2
    ret
.write4:
    pshufb      m0, m9
    movu        [rpq+xq+ 0], m0
    psrlq       m0, 8
    movd        [rpq+xq+16], m0
    add         xq, 5*4
    ret
.write8:
    pshufb      m2, m0, m9
    movu        [rpq+xq+ 0], m2
    pshufb      m0, m10
    movu        [rpq+xq+16], m0
    psrldq      m2, 2
    movq        [rpq+xq+32], m2
    add         xq, 5*8
    ret
.write16:
    pshufb      m2, m0, m9
    movu        [rpq+xq+ 0], m2
    pshufb      m0, m10
    movu        [rpq+xq+16], m0
    shufps      m2, m0, q1032
    movu        [rpq+xq+48], m2
    shufps      m2, m0, q2121
    movu        [rpq+xq+32], m2
    shufps      m0, m2, q1032
    movu        [rpq+xq+64], m0
    add         xq, 5*16
    ret

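; splat_mv fills rows of 12-byte refmvs_block entries with copies of a single
; block. bx4*3 converts a 4x4 column index into a dword offset (3 dwords per
; block), and m0-m2 hold the same 12-byte pattern at its three possible
; rotations so that whole-register stores cover the repeating layout.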
INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add         bx4d, bw4d
    tzcnt       bw4d, bw4d
    mova        m2, [aq]
    LEA         aq, splat_mv_sse2_table
    lea         bx4q, [bx4q*3-32]
    movsxd      bw4q, [aq+bw4q*4]
    movifnidn   bh4d, bh4m
    pshufd      m0, m2, q0210
    pshufd      m1, m2, q1021
    pshufd      m2, m2, q2102
    add         bw4q, aq
.loop:
    mov         aq, [rrq]
    add         rrq, gprsize
    lea         aq, [aq+bx4q*4]
    jmp         bw4q
.w32:
    mova        [aq-16*16], m0
    mova        [aq-16*15], m1
    mova        [aq-16*14], m2
    mova        [aq-16*13], m0
    mova        [aq-16*12], m1
    mova        [aq-16*11], m2
    mova        [aq-16*10], m0
    mova        [aq-16* 9], m1
    mova        [aq-16* 8], m2
    mova        [aq-16* 7], m0
    mova        [aq-16* 6], m1
    mova        [aq-16* 5], m2
.w16:
    mova        [aq-16* 4], m0
    mova        [aq-16* 3], m1
    mova        [aq-16* 2], m2
    mova        [aq-16* 1], m0
    mova        [aq+16* 0], m1
    mova        [aq+16* 1], m2
.w8:
    mova        [aq+16* 2], m0
    mova        [aq+16* 3], m1
    mova        [aq+16* 4], m2
.w4:
    mova        [aq+16* 5], m0
    mova        [aq+16* 6], m1
    mova        [aq+16* 7], m2
    dec         bh4d
    jg          .loop
    RET
.w2:
    movu        [aq+104], m0
    movq        [aq+120], m1
    dec         bh4d
    jg          .loop
    RET
.w1:
    movq        [aq+116], m0
    movd        [aq+124], m2
    dec         bh4d
    jg          .loop
    RET

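; load_tmvs projects motion vectors from the reference frames' temporal mv
; buffers (rp_ref) into the current frame's rp_proj buffer. mv_proj[n] is the
; fixed-point reciprocal 16384/n, so multiplying an mv by
; mv_proj[ref2ref]*ref2cur and shifting right by 14 rescales it by the
; frame-distance ratio ref2cur/ref2ref.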
%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor         r14d, r14d
    cmp         dword [rfq+212], 1 ; n_tile_threads
    mov         ih8d, [rfq+20]     ; rf->ih8
    mov         iw8d, [rfq+16]     ; rf->iw8
    mov         xstartd, xstartd
    mov         xendd, xendd
    cmove       tridxd, r14d
    lea         xstartid, [xstartq-8]
    lea         xendid, [xendq+8]
    mov         strideq, [rfq+184]
    mov         rp_projq, [rfq+176]
    cmp         ih8d, yendd
    mov         [rsp+0x30], strideq
    cmovs       yendd, ih8d
    test        xstartid, xstartid
    cmovs       xstartid, r14d
    cmp         iw8d, xendid
    cmovs       xendid, iw8d
    mov         troffq, strideq
    shl         troffq, 4
    imul        troffq, tridxq
    mov         dstd, ystartd
    and         dstd, 15
    imul        dstq, strideq
    add         dstq, troffq       ; (16 * tridx + (ystart & 15)) * stride
    lea         dstq, [dstq*5]
    add         dstq, rp_projq
    lea         troffq, [troffq*5] ; 16 * tridx * stride * 5
    lea         r13d, [xendq*5]
    lea         r12, [strideq*5]
    DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
                _, troff, xendi, xstarti, stride5, _, dst
    lea         w5d, [xstartq*5]
    add         r7, troffq         ; rp_proj + tile_row_offset
    mov         hd, yendd
    mov         [rsp+0x28], r7
    add         dstq, r13
    sub         w5q, r13
    sub         hd, ystartd
.init_xloop_start:
    mov         x5q, w5q
    test        w5b, 1
    jz          .init_2blk
    mov         dword [dstq+x5q], 0x80008000
    add         x5q, 5
    jz          .init_next_row
.init_2blk:
    mov         dword [dstq+x5q+0], 0x80008000
    mov         dword [dstq+x5q+5], 0x80008000
    add         x5q, 10
    jl          .init_2blk
.init_next_row:
    add         dstq, stride5q
    dec         hd
    jg          .init_xloop_start
    DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
                _, _, xendi, xstarti, stride5, _, n
    mov         r13d, [rfq+152]    ; rf->n_mfmvs
    test        r13d, r13d
    jz          .ret
    mov         [rsp+0x0c], r13d
    mov         strideq, [rsp+0x30]
    movddup     m3, [pq_8192]
    mov         r9d, ystartd
    mov         [rsp+0x38], yendd
    mov         [rsp+0x20], xstartid
    xor         nd, nd
    xor         n7d, n7d
    imul        r9, strideq        ; ystart * stride
    mov         [rsp+0x48], rfq
    mov         [rsp+0x18], stride5q
    lea         r7, [r9*5]
    mov         [rsp+0x24], ystartd
    mov         [rsp+0x00], r7
.nloop:
    DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
                ref, rp_ref, xendi, xstarti, _, _, n
    mov         rfq, [rsp+0x48]
    mov         refd, [rfq+56+nq*4] ; ref2cur
    cmp         refd, 0x80000000
    je          .next_n
    mov         [rsp+0x40], refd
    mov         offq, [rsp+0x00]   ; ystart * stride * 5
    movzx       refd, byte [rfq+53+nq] ; rf->mfmv_ref[n]
    lea         refsignq, [refq-4]
    mov         rp_refq, [rfq+168]
    movq        m2, refsignq
    add         offq, [rp_refq+refq*8] ; r = rp_ref[ref] + row_offset
    mov         [rsp+0x14], nd
    mov         yd, ystartd
.yloop:
    mov         r11d, [rsp+0x24]   ; ystart
    mov         r12d, [rsp+0x38]   ; yend
    mov         r14d, yd
    and         r14d, ~7           ; y_sb_align
    cmp         r11d, r14d
    cmovs       r11d, r14d         ; imax(y_sb_align, ystart)
    mov         [rsp+0x44], r11d   ; y_proj_start
    add         r14d, 8
    cmp         r12d, r14d
    cmovs       r14d, r12d         ; imin(y_sb_align + 8, yend)
    mov         [rsp+0x3c], r14d   ; y_proj_end
    DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
                ref, x, xendi, mvx, mvy, rb, ref2ref
    mov         xd, [rsp+0x20]     ; xstarti
.xloop:
    lea         rbd, [xq*5]
    add         rbq, srcq
    movsx       refd, byte [rbq+4]
    test        refd, refd
    jz          .next_x_bad_ref
    mov         rfq, [rsp+0x48]
    lea         r14d, [16+n7q+refq]
    mov         ref2refd, [rfq+r14*4] ; rf->mfmv_ref2ref[n][b_ref-1]
    test        ref2refd, ref2refd
    jz          .next_x_bad_ref
    lea         fracq, [mv_proj]
    movzx       fracd, word [fracq+ref2refq*2]
    mov         mvd, [rbq]
    imul        fracd, [rsp+0x40]  ; ref2cur
    pmovsxwq    m0, [rbq]
    movd        m1, fracd
    punpcklqdq  m1, m1
    pmuldq      m0, m1             ; mv * frac
    pshufd      m1, m0, q3311
    paddd       m0, m3
    paddd       m0, m1
    psrad       m0, 14             ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd       m1, m0
    packssdw    m0, m0
    psrld       m1, 6
    packuswb    m1, m1
    pxor        m0, m2             ; offset ^ ref_sign
    psignd      m1, m0             ; apply_sign(abs(offset) >> 6, offset ^ refsign)
    movq        mvxq, m1
    lea         mvyd, [mvxq+yq]    ; ypos
    sar         mvxq, 32
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
                ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp         yposd, [rsp+0x44]  ; y_proj_start
    jl          .next_x_bad_pos_y
    cmp         yposd, [rsp+0x3c]  ; y_proj_end
    jge         .next_x_bad_pos_y
    and         yposd, 15
    add         mvxq, xq           ; xpos
    imul        yposq, [rsp+0x30]  ; pos = (ypos & 15) * stride
    DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
                ref, x, xendi, xpos, pos, rb, ref2ref
    mov         dstq, [rsp+0x28]   ; dst = rp_proj + tile_row_offset
    add         posq, xposq        ; pos += xpos
    lea         posq, [posq*5]
    add         dstq, posq         ; dst += pos5
    jmp         .write_loop_entry
.write_loop:
    add         rbq, 5
    cmp         refb, byte [rbq+4]
    jne         .xloop
    cmp         mvd, [rbq]
    jne         .xloop
    add         dstq, 5
    inc         xposd
.write_loop_entry:
    mov         r12d, xd
    and         r12d, ~7
    lea         r5d, [r12-8]
    cmp         r5d, xstartd
    cmovs       r5d, xstartd       ; x_proj_start
    cmp         xposd, r5d
    jl          .next_xpos
    add         r12d, 16
    cmp         xendd, r12d
    cmovs       r12d, xendd        ; x_proj_end
    cmp         xposd, r12d
    jge         .next_xpos
    mov         [dstq+0], mvd
    mov         byte [dstq+4], ref2refb
.next_xpos:
    inc         xd
    cmp         xd, xendid
    jl          .write_loop
.next_y:
    DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add         srcq, [rsp+0x18]   ; stride5
    inc         yd
    cmp         yd, [rsp+0x38]     ; yend
    jne         .yloop
    mov         nd, [rsp+0x14]
    mov         ystartd, [rsp+0x24]
.next_n:
    add         n7d, 7
    inc         nd
    cmp         nd, [rsp+0x0c]     ; n_mfmvs
    jne         .nloop
.ret:
    RET
.next_x:
    DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add         rbq, 5
    cmp         refb, byte [rbq+4]
    jne         .xloop
    cmp         mvd, [rbq]
    jne         .xloop
.next_x_bad_pos_y:
    inc         xd
    cmp         xd, xendid
    jl          .next_x
    jmp         .next_y
.next_x_bad_ref:
    inc         xd
    cmp         xd, xendid
    jl          .xloop
    jmp         .next_y

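; The AVX2 and AVX-512 save_tmvs variants below follow the same algorithm as
; the SSSE3 version, but pack two (AVX2) or four (AVX-512) candidate blocks
; into one register before the ref/mv checks and the .write<w> dispatch.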
INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r12-.write1
    lea         r12, [.write1]
    movifnidn   xendd, xendm
    movifnidn   yendd, yendm
    mov         xstartd, xstartm
    mov         ystartd, ystartm
    vpbroadcastq m4, [ref_signq]
    vpbroadcastq m3, [base+save_ref_shuf+8]
    vpbroadcastq m5, [base+save_cond0]
    vpbroadcastq m6, [base+save_cond1]
    vpbroadcastd m7, [base+pb_128]
    mova        m8, [base+save_pack0]
    mova        m9, [base+save_pack1]
    psllq       m4, 8
    lea         r9d, [xendq*5]
    lea         xstartd, [xstartq*5]
    sub         yendd, ystartd
    add         ystartd, ystartd
    lea         strideq, [strideq*5]
    sub         xstartq, r9
    add         xendd, r9d
    add         rpq, r9
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and         ystartd, 30
    mov         xq, xstartq
    mov         bq, [rrq+ystartq*8]
    add         ystartd, 2
    lea         bq, [bq+xendq*4]
.loop_x:
    imul        candq, xq, 0x9999
    sar         candq, 16          ; x / 5 * 3
    movzx       r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu        xm0, [bq+candq*8+12]       ; cand_b
    movzx       r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
    movzx       r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
    add         r10, r12
    add         candq, r11
    jge         .calc
    vinserti128 m0, [bq+candq*8+12], 1
    movzx       r11d, byte [bq+candq*8+22]
    movzx       r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
    add         r11, r12
.calc:
    pshufb      m1, m0, m3
    pabsw       m2, m0
    pshufb      m1, m4, m1         ; ref > 0 && ref_sign[ref - 1]
    psrlw       m2, 12             ; (abs(mv.x) | abs(mv.y)) < 4096
    pcmpgtd     m1, m2
    pshufd      m2, m1, q2301
    pand        m1, m5             ; b0.cond0 b1.cond0
    pand        m2, m6             ; b0.cond1 b1.cond1
    por         m1, m2             ; b0.shuf b1.shuf
    pxor        m1, m7             ; if cond0|cond1 == 0 => zero out
    pshufb      m0, m1
    call        r10
    jge         .next_line
    vextracti128 xm0, m0, 1
    call        r11
    jl          .loop_x
.next_line:
    add         rpq, strideq
    dec         hd
    jg          .loop_y
    RET
.write1:
    movd        [rpq+xq+ 0], xm0
    pextrb      [rpq+xq+ 4], xm0, 4
    add         xq, 5*1
    ret
.write2:
    movq        [rpq+xq+0], xm0
    psrlq       xm1, xm0, 8
    movd        [rpq+xq+6], xm1
    add         xq, 5*2
    ret
.write4:
    pshufb      xm1, xm0, xm8
    movu        [rpq+xq+ 0], xm1
    psrlq       xm1, 8
    movd        [rpq+xq+16], xm1
    add         xq, 5*4
    ret
.write8:
    vinserti128 m1, m0, xm0, 1
    pshufb      m1, m8
    movu        [rpq+xq+ 0], m1
    psrldq      xm1, 2
    movq        [rpq+xq+32], xm1
    add         xq, 5*8
    ret
.write16:
    vinserti128 m1, m0, xm0, 1
    pshufb      m2, m1, m8
    movu        [rpq+xq+ 0], m2
    pshufb      m1, m9
    movu        [rpq+xq+32], m1
    shufps      xm2, xm1, q1021
    movu        [rpq+xq+64], xm2
    add         xq, 5*16
    ret

cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add         bx4d, bw4d
    tzcnt       bw4d, bw4d
    vbroadcasti128 m0, [aq]
    lea         aq, [splat_mv_avx2_table]
    lea         bx4q, [bx4q*3-32]
    movsxd      bw4q, [aq+bw4q*4]
    pshufb      m0, [splat_mv_shuf]
    movifnidn   bh4d, bh4m
    pshufd      m1, m0, q2102
    pshufd      m2, m0, q1021
    add         bw4q, aq
.loop:
    mov         aq, [rrq]
    add         rrq, gprsize
    lea         aq, [aq+bx4q*4]
    jmp         bw4q
.w32:
    mova        [aq-32*8], m0
    mova        [aq-32*7], m1
    mova        [aq-32*6], m2
    mova        [aq-32*5], m0
    mova        [aq-32*4], m1
    mova        [aq-32*3], m2
.w16:
    mova        [aq-32*2], m0
    mova        [aq-32*1], m1
    mova        [aq+32*0], m2
.w8:
    mova        [aq+32*1], m0
    mova        [aq+32*2], m1
    mova        [aq+32*3], m2
    dec         bh4d
    jg          .loop
    RET
.w4:
    movu        [aq+ 80], m0
    mova        [aq+112], xm1
    dec         bh4d
    jg          .loop
    RET
.w2:
    movu        [aq+104], xm0
    movq        [aq+120], xm2
    dec         bh4d
    jg          .loop
    RET
.w1:
    movq        [aq+116], xm0
    movd        [aq+124], xm1
    dec         bh4d
    jg          .loop
    RET

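; With AVX-512, a single opmask (k2 = 0x1f) covers all the partial stores:
; masking 5 elements at byte/word/dword/qword granularity writes exactly
; 5, 10, 20 or 40 bytes, i.e. 1, 2, 4 or 8 temporal blocks per store.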
INIT_ZMM avx512icl
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r14-.write1
    lea         r14, [.write1]
    movifnidn   xendd, xendm
    movifnidn   yendd, yendm
    mov         xstartd, xstartm
    mov         ystartd, ystartm
    psllq       m4, [ref_signq]{bcstq}, 8
    vpbroadcastq m3, [base+save_ref_shuf+8]
    vbroadcasti32x4 m5, [base+cond_shuf512]
    vbroadcasti32x4 m6, [base+save_cond0]
    vpbroadcastd m7, [base+pb_128]
    mova        m8, [base+save_pack0]
    movu        xm9, [base+save_pack0+4]
    lea         r9d, [xendq*5]
    lea         xstartd, [xstartq*5]
    sub         yendd, ystartd
    add         ystartd, ystartd
    lea         strideq, [strideq*5]
    sub         xstartq, r9
    add         xendd, r9d
    add         rpq, r9
    mov         r10d, 0x1f
    kmovb       k2, r10d
    DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and         ystartd, 30
    mov         xq, xstartq
    mov         bq, [rrq+ystartq*8]
    add         ystartd, 2
    lea         bq, [bq+xendq*4]
.loop_x:
    imul        candq, xq, 0x9999
    sar         candq, 16          ; x / 5 * 3
    movzx       r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu        xm0, [bq+candq*8+12]       ; cand_b
    movzx       r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
    movzx       r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
    add         r10, r14
    add         candq, r11
    jge         .calc
    movzx       r11d, byte [bq+candq*8+22]
    vinserti32x4 ym0, [bq+candq*8+12], 1
    movzx       r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
    movzx       r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
    add         r11, r14
    add         candq, r12
    jge         .calc
    movzx       r12d, byte [bq+candq*8+22]
    vinserti32x4 m0, [bq+candq*8+12], 2
    movzx       r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
    movzx       r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
    add         r12, r14
    add         candq, r13
    jge         .calc
    vinserti32x4 m0, [bq+candq*8+12], 3
    movzx       r13d, byte [bq+candq*8+22]
    movzx       r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
    add         r13, r14
.calc:
    pshufb      m1, m0, m3
    pabsw       m2, m0
    pshufb      m1, m4, m1         ; ref > 0 && ref_sign[ref - 1]
    psrlw       m2, 12             ; (abs(mv.x) | abs(mv.y)) < 4096
    psubd       m2, m1
    pshufb      m2, m5             ; c0 c1 c1 c0
    pand        m2, m6
    punpckhqdq  m1, m2, m2
    vpternlogd  m1, m2, m7, 0x56   ; (c0shuf | c1shuf) ^ 0x80
    pshufb      m2, m0, m1
    mova        xm0, xm2
    call        r10
    jge         .next_line
    vextracti32x4 xm0, m2, 1
    call        r11
    jge         .next_line
    vextracti32x4 xm0, m2, 2
    call        r12
    jge         .next_line
    vextracti32x4 xm0, m2, 3
    call        r13
    jl          .loop_x
.next_line:
    add         rpq, strideq
    dec         hd
    jg          .loop_y
    RET
.write1:
    vmovdqu8    [rpq+xq]{k2}, xm0
    add         xq, 5*1
    ret
.write2:
    pshufb      xm0, xm8
    vmovdqu16   [rpq+xq]{k2}, xm0
    add         xq, 5*2
    ret
.write4:
    vpermb      ym0, ym8, ym0
    vmovdqu32   [rpq+xq]{k2}, ym0
    add         xq, 5*4
    ret
.write8:
    vpermb      m0, m8, m0
    vmovdqu64   [rpq+xq]{k2}, m0
    add         xq, 5*8
    ret
.write16:
    vpermb      m1, m8, m0
    movu        [rpq+xq+ 0], m1
    pshufb      xm0, xm9
    movu        [rpq+xq+64], xm0
    add         xq, 5*16
    ret

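; In the AVX-512 splat_mv below, k1 = 0x3f masks 6 elements, so masked word,
; dword and qword stores cover exactly 1, 2 or 4 refmvs_block entries
; (12, 24 or 48 bytes) for the .w1/.w2/.w4 cases.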
INIT_ZMM avx512icl
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    vbroadcasti32x4 m0, [aq]
    lea         r1, [splat_mv_avx512icl_table]
    tzcnt       bw4d, bw4d
    lea         bx4d, [bx4q*3]
    pshufb      m0, [splat_mv_shuf]
    movsxd      bw4q, [r1+bw4q*4]
    mov         r6d, bh4m
    add         bw4q, r1
    lea         rrq, [rrq+r6*8]
    mov         r1d, 0x3f
    neg         r6
    kmovb       k1, r1d
    jmp         bw4q
.w1:
    mov         r1, [rrq+r6*8]
    vmovdqu16   [r1+bx4q*4]{k1}, xm0
    inc         r6
    jl          .w1
    RET
.w2:
    mov         r1, [rrq+r6*8]
    vmovdqu32   [r1+bx4q*4]{k1}, ym0
    inc         r6
    jl          .w2
    RET
.w4:
    mov         r1, [rrq+r6*8]
    vmovdqu64   [r1+bx4q*4]{k1}, m0
    inc         r6
    jl          .w4
    RET
.w8:
    pshufd      ym1, ym0, q1021
.w8_loop:
    mov         r1, [rrq+r6*8+0]
    mov         r3, [rrq+r6*8+8]
    movu        [r1+bx4q*4+ 0], m0
    mova        [r1+bx4q*4+64], ym1
    movu        [r3+bx4q*4+ 0], m0
    mova        [r3+bx4q*4+64], ym1
    add         r6, 2
    jl          .w8_loop
    RET
.w16:
    pshufd      m1, m0, q1021
    pshufd      m2, m0, q2102
.w16_loop:
    mov         r1, [rrq+r6*8+0]
    mov         r3, [rrq+r6*8+8]
    mova        [r1+bx4q*4+64*0], m0
    mova        [r1+bx4q*4+64*1], m1
    mova        [r1+bx4q*4+64*2], m2
    mova        [r3+bx4q*4+64*0], m0
    mova        [r3+bx4q*4+64*1], m1
    mova        [r3+bx4q*4+64*2], m2
    add         r6, 2
    jl          .w16_loop
    RET
.w32:
    pshufd      m1, m0, q1021
    pshufd      m2, m0, q2102
.w32_loop:
    mov         r1, [rrq+r6*8]
    lea         r1, [r1+bx4q*4]
    mova        [r1+64*0], m0
    mova        [r1+64*1], m1
    mova        [r1+64*2], m2
    mova        [r1+64*3], m0
    mova        [r1+64*4], m1
    mova        [r1+64*5], m2
    inc         r6
    jl          .w32_loop
    RET
%endif ; ARCH_X86_64