; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

%macro JMP_TABLE 2-*
    %xdefine %%prefix mangle(private_prefix %+ _%1)
    %1_table:
    %xdefine %%base %1_table
    %rep %0 - 1
        dd %%prefix %+ .w%2 - %%base
        %rotate 1
    %endrep
%endmacro
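; Note: JMP_TABLE stores each .w<N> entry point as a 32-bit offset from
; the table base rather than as an absolute pointer, keeping the data
; position-independent. The splat_mv implementations below consume it
; roughly as follows (register names illustrative):
;     movsxd  bw4q, [tableq+bw4q*4] ; sign-extend the relative offset
;     add     bw4q, tableq          ; rebase to an absolute address
;     jmp     bw4q                  ; dispatch to the .w<N> label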

%macro SAVE_TMVS_TABLE 3 ; num_entries, w, suffix
    %rep %1
        db %2*3
        db mangle(private_prefix %+ _save_tmvs_%3).write%2 - \
           mangle(private_prefix %+ _save_tmvs_%3).write1
    %endrep
%endmacro
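; Each SAVE_TMVS_TABLE entry is a two-byte pair per block size: byte 0
; holds w*3, the candidate-index step for a block spanning w 8-pixel
; units (matching the x/5*3 index scaling in the save_tmvs x-loop), and
; byte 1 holds the offset of the .write<w> handler relative to .write1.
; The repeat counts used below (2+4+4+5+7 = 22 entries) appear to cover
; dav1d's block-size enum, grouped by effective width.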

%if ARCH_X86_64
mv_proj:       dw    0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
               dw 2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
               dw 1024,   963,  910,  862,  819,  780,  744,  712
               dw  682,   655,  630,  606,  585,  564,  546,  528
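; mv_proj[d] = 16384/d (truncated) for d = 1..31, i.e. Q14 fixed-point
; reciprocals; entry 0 is unused. load_tmvs uses this to scale a motion
; vector without a division, computing approximately
;     mv * mv_proj[ref2ref] * ref2cur >> 14  ~=  mv * ref2cur / ref2ref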
splat_mv_shuf: db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
               db  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11,  0,  1,  2,  3
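; splat_mv_shuf[k] = k % 12: a pshufb with this mask replicates one
; 12-byte refmvs_block across a whole vector register, so the rotated
; q1021/q2102 variants derived from the result can store runs of
; contiguous 12-byte blocks using full-width writes.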
%endif
save_pack0:    db  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0
               db  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1
save_pack1:    db  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2
               db  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3,  4,  0,  1,  2,  3
save_ref_shuf: db  0, -1, -1, -1,  1, -1, -1, -1,  8, -1, -1, -1,  9, -1, -1, -1
cond_shuf512:  db  3,  3,  3,  3,  7,  7,  7,  7,  7,  7,  7,  7,  3,  3,  3,  3
save_cond0:    db  0x80, 0x81, 0x82, 0x83, 0x89, 0x84, 0x00, 0x00
save_cond1:    db  0x84, 0x85, 0x86, 0x87, 0x88, 0x80, 0x00, 0x00
pb_128:        times 16 db 128
pq_8192:       dq 8192

save_tmvs_ssse3_table: SAVE_TMVS_TABLE 2, 16, ssse3
                       SAVE_TMVS_TABLE 4,  8, ssse3
                       SAVE_TMVS_TABLE 4,  4, ssse3
                       SAVE_TMVS_TABLE 5,  2, ssse3
                       SAVE_TMVS_TABLE 7,  1, ssse3

%if ARCH_X86_64
save_tmvs_avx2_table: SAVE_TMVS_TABLE 2, 16, avx2
                      SAVE_TMVS_TABLE 4,  8, avx2
                      SAVE_TMVS_TABLE 4,  4, avx2
                      SAVE_TMVS_TABLE 5,  2, avx2
                      SAVE_TMVS_TABLE 7,  1, avx2

save_tmvs_avx512icl_table: SAVE_TMVS_TABLE 2, 16, avx512icl
                           SAVE_TMVS_TABLE 4,  8, avx512icl
                           SAVE_TMVS_TABLE 4,  4, avx512icl
                           SAVE_TMVS_TABLE 5,  2, avx512icl
                           SAVE_TMVS_TABLE 7,  1, avx512icl

JMP_TABLE splat_mv_avx512icl, 1, 2, 4, 8, 16, 32
JMP_TABLE splat_mv_avx2,      1, 2, 4, 8, 16, 32
%endif

JMP_TABLE splat_mv_sse2,      1, 2, 4, 8, 16, 32

SECTION .text

%macro movif32 2
%if ARCH_X86_32
    mov             %1, %2
%endif
%endmacro
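; movif32 emits the mov only on x86-32, where values that stay in
; registers on x86-64 must instead be reloaded from their stack homes;
; on x86-64 it expands to nothing.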

INIT_XMM ssse3
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
%if ARCH_X86_64
cglobal save_tmvs, 4, 13, 11, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base_reg r12
%else
cglobal save_tmvs, 6, 7, 8, rp, stride, rr, ref_sign, \
                            xend, yend, xstart, ystart
    movq            m5, [ref_signq]
    lea        strided, [strided*5]
    mov        stridem, strided
    mov             r3, xstartm
    mov             r1, ystartm
 DEFINE_ARGS b, ystart, rr, cand, xend, x
%define stridemp r1m
%define m8  [base+pb_128]
%define m9  [base+save_pack0+ 0]
%define m10 [base+save_pack0+16]
%define base_reg r6
%endif
%define base base_reg-.write1
    LEA       base_reg, .write1
%if ARCH_X86_64
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    movq            m5, [ref_signq]
%endif
    movu            m4, [base+save_ref_shuf]
    movddup         m6, [base+save_cond0]
    movddup         m7, [base+save_cond1]
%if ARCH_X86_64
    mova            m8, [base+pb_128]
    mova            m9, [base+save_pack0+ 0]
    mova           m10, [base+save_pack0+16]
%endif
    psllq           m5, 8
%if ARCH_X86_64
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
%else
    lea             r0, [xendd*5]   ; xend5
    lea             r3, [r3*5]      ; xstart5
    sub             r3, r0          ; -w5
    mov            r6m, r3
%define xstartq r6m
    add          xendd, r0          ; xend6
    add            r0m, r0          ; rp+xend5
    mov          xendm, xendd
    sub             r5, r1          ; h
    add             r1, r1
    mov            r7m, r1
    mov            r5m, r5
%define hd r5mp
    jmp .loop_y_noload
%endif
.loop_y:
    movif32    ystartd, r7m
    movif32      xendd, xendm
.loop_y_noload:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*gprsize]
    add        ystartd, 2
    movif32        r7m, ystartd
    lea             bq, [bq+xendq*4]
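    ; x is a negative byte offset in 5-byte refmvs_temporal_block
    ; units; multiplying by 0x9999 (~3/5 in 16.16 fixed point) and
    ; shifting right by 16 rescales it so that cand*8 below addresses
    ; the refmvs_block row at two 12-byte spatial blocks per 8-pixel
    ; unit: for each step of 5 in x, cand*8 advances by 24 bytes.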
.loop_x:
%if ARCH_X86_32
%define rpq  r3
%define r10  r1
%define r10d r1
%define r11  r4
%define r11d r4
%endif
    imul         candq, xq, 0x9999  ; x / 5 * 3
    sar          candq, 16
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu            m0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_ssse3_table+r10*2+1]
    add            r10, base_reg
    add          candq, r11
    jge .calc
    movu            m1, [bq+candq*8+12]
    movzx         r11d, byte [bq+candq*8+22]
    movzx         r11d, byte [base+save_tmvs_ssse3_table+r11*2+1]
    add            r11, base_reg
.calc:
    movif32        rpq, r0m
    ; ref check
    punpckhqdq      m2, m0, m1
    pshufb          m2, m4      ; b0.ref0 b0.ref1 b1.ref0 b1.ref1 | ...
    pshufb          m3, m5, m2  ; ref > 0 && ref_sign[ref - 1]
    ; mv check
    punpcklqdq      m2, m0, m1  ; b0.mv0 b0.mv1 b1.mv0 b1.mv1 | ...
    pabsw           m2, m2
    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
    ; res
    pcmpgtd         m3, m2
    pshufd          m2, m3, q2301
    pand            m3, m6      ; b0c0 b0c1 b1c0 b1c1 | ...
    pand            m2, m7      ; b0c1 b0c0 b1c1 b1c0 | ...
    por             m3, m2      ; b0.shuf b1.shuf | ...
    pxor            m3, m8      ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m3
    pshufb          m1, m3
    call           r10
    jge .next_line
    pshufd          m0, m1, q3232
    call           r11
    jl .loop_x
.next_line:
    add            rpq, stridemp
    movif32        r0m, rpq
    dec             hd
    jg .loop_y
    RET
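; Each .write<N> handler stores one candidate's 5-byte
; refmvs_temporal_block (4-byte mv + 1-byte ref) replicated across N
; 8-pixel units. save_pack0/1 hold the byte pattern 0..4 repeated with
; period 5, so a single pshufb lays out the replicated blocks; some
; stores overlap by a few bytes, which is harmless since x always
; advances by the full 5*N afterwards.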
.write1:
    movd    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+1], m0
    add             xq, 5*1
    ret
.write2:
    movq    [rpq+xq+0], m0
    psrlq           m0, 8
    movd    [rpq+xq+6], m0
    add             xq, 5*2
    ret
.write4:
    pshufb          m0, m9
    movu   [rpq+xq+ 0], m0
    psrlq           m0, 8
    movd   [rpq+xq+16], m0
    add             xq, 5*4
    ret
.write8:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    psrldq          m2, 2
    movq   [rpq+xq+32], m2
    add             xq, 5*8
    ret
.write16:
    pshufb          m2, m0, m9
    movu   [rpq+xq+ 0], m2
    pshufb          m0, m10
    movu   [rpq+xq+16], m0
    shufps          m2, m0, q1032
    movu   [rpq+xq+48], m2
    shufps          m2, m0, q2121
    movu   [rpq+xq+32], m2
    shufps          m0, m2, q1032
    movu   [rpq+xq+64], m0
    add             xq, 5*16
    ret

INIT_XMM sse2
; refmvs_block **rr, refmvs_block *a, int bx4, int bw4, int bh4
cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d
    mova            m2, [aq]
    LEA             aq, splat_mv_sse2_table
    lea           bx4q, [bx4q*3-32]
    movsxd        bw4q, [aq+bw4q*4]
    movifnidn     bh4d, bh4m
    pshufd          m0, m2, q0210
    pshufd          m1, m2, q1021
    pshufd          m2, m2, q2102
    add           bw4q, aq
.loop:
    mov             aq, [rrq]
    add            rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp           bw4q
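; The width cases fall through into each other (.w32 -> .w16 -> .w8 ->
; .w4), so a single indirect jump stores 12*bw4 bytes per row while
; sharing one loop epilogue; only .w2 and .w1 need separate tails.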
.w32:
    mova    [aq-16*16], m0
    mova    [aq-16*15], m1
    mova    [aq-16*14], m2
    mova    [aq-16*13], m0
    mova    [aq-16*12], m1
    mova    [aq-16*11], m2
    mova    [aq-16*10], m0
    mova    [aq-16* 9], m1
    mova    [aq-16* 8], m2
    mova    [aq-16* 7], m0
    mova    [aq-16* 6], m1
    mova    [aq-16* 5], m2
.w16:
    mova    [aq-16* 4], m0
    mova    [aq-16* 3], m1
    mova    [aq-16* 2], m2
    mova    [aq-16* 1], m0
    mova    [aq+16* 0], m1
    mova    [aq+16* 1], m2
.w8:
    mova    [aq+16* 2], m0
    mova    [aq+16* 3], m1
    mova    [aq+16* 4], m2
.w4:
    mova    [aq+16* 5], m0
    mova    [aq+16* 6], m1
    mova    [aq+16* 7], m2
    dec           bh4d
    jg .loop
    RET
.w2:
    movu      [aq+104], m0
    movq      [aq+120], m1
    dec           bh4d
    jg .loop
    RET
.w1:
    movq      [aq+116], m0
    movd      [aq+124], m2
    dec           bh4d
    jg .loop
    RET
%if ARCH_X86_64
INIT_XMM sse4
; refmvs_frame *rf, int tile_row_idx,
; int col_start8, int col_end8, int row_start8, int row_end8
cglobal load_tmvs, 6, 15, 4, -0x50, rf, tridx, xstart, xend, ystart, yend, \
                                    stride, rp_proj, roff, troff, \
                                    xendi, xstarti, iw8, ih8, dst
    xor           r14d, r14d
    cmp dword [rfq+212], 1          ; n_tile_threads
    mov           ih8d, [rfq+20]    ; rf->ih8
    mov           iw8d, [rfq+16]    ; rf->iw8
    mov        xstartd, xstartd
    mov          xendd, xendd
    cmove       tridxd, r14d
    lea       xstartid, [xstartq-8]
    lea         xendid, [xendq+8]
    mov        strideq, [rfq+184]
    mov       rp_projq, [rfq+176]
    cmp           ih8d, yendd
    mov     [rsp+0x30], strideq
    cmovs        yendd, ih8d
    test      xstartid, xstartid
    cmovs     xstartid, r14d
    cmp           iw8d, xendid
    cmovs       xendid, iw8d
    mov         troffq, strideq
    shl         troffq, 4
    imul        troffq, tridxq
    mov           dstd, ystartd
    and           dstd, 15
    imul          dstq, strideq
    add           dstq, troffq      ; (16 * tridx + (ystart & 15)) * stride
    lea           dstq, [dstq*5]
    add           dstq, rp_projq
    lea         troffq, [troffq*5]  ; 16 * tridx * stride * 5
    lea           r13d, [xendq*5]
    lea            r12, [strideq*5]
 DEFINE_ARGS rf, w5, xstart, xend, ystart, yend, h, x5, \
             _, troff, xendi, xstarti, stride5, _, dst
    lea            w5d, [xstartq*5]
    add             r7, troffq      ; rp_proj + tile_row_offset
    mov             hd, yendd
    mov     [rsp+0x28], r7
    add           dstq, r13
    sub            w5q, r13
    sub             hd, ystartd
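; Fill the tile's rows of rp_proj with the invalid-mv marker
; (mv == 0x80008000) so that projection misses can be recognized later.
; Only the 4-byte mv of each 5-byte block is written; the parity test
; below peels one block when the row length in blocks is odd, then the
; main loop initializes two blocks per iteration.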
.init_xloop_start:
    mov            x5q, w5q
    test           w5b, 1
    jz .init_2blk
    mov dword [dstq+x5q], 0x80008000
    add            x5q, 5
    jz .init_next_row
.init_2blk:
    mov dword [dstq+x5q+0], 0x80008000
    mov dword [dstq+x5q+5], 0x80008000
    add            x5q, 10
    jl .init_2blk
.init_next_row:
    add           dstq, stride5q
    dec             hd
    jg .init_xloop_start
 DEFINE_ARGS rf, _, xstart, xend, ystart, yend, n7, stride, \
             _, _, xendi, xstarti, stride5, _, n
    mov           r13d, [rfq+152]   ; rf->n_mfmvs
    test          r13d, r13d
    jz .ret
    mov     [rsp+0x0c], r13d
    mov        strideq, [rsp+0x30]
    movddup         m3, [pq_8192]
    mov            r9d, ystartd
    mov     [rsp+0x38], yendd
    mov     [rsp+0x20], xstartid
    xor             nd, nd
    xor            n7d, n7d
    imul            r9, strideq     ; ystart * stride
    mov     [rsp+0x48], rfq
    mov     [rsp+0x18], stride5q
    lea             r7, [r9*5]
    mov     [rsp+0x24], ystartd
    mov     [rsp+0x00], r7
.nloop:
 DEFINE_ARGS y, off, xstart, xend, ystart, rf, n7, refsign, \
             ref, rp_ref, xendi, xstarti, _, _, n
    mov            rfq, [rsp+0x48]
    mov           refd, [rfq+56+nq*4]       ; ref2cur
    cmp           refd, 0x80000000
    je .next_n
    mov     [rsp+0x40], refd
    mov           offq, [rsp+0x00]          ; ystart * stride * 5
    movzx         refd, byte [rfq+53+nq]    ; rf->mfmv_ref[n]
    lea       refsignq, [refq-4]
    mov        rp_refq, [rfq+168]
    movq            m2, refsignq
    add           offq, [rp_refq+refq*8]    ; r = rp_ref[ref] + row_offset
    mov     [rsp+0x14], nd
    mov             yd, ystartd
.yloop:
    mov           r11d, [rsp+0x24]          ; ystart
    mov           r12d, [rsp+0x38]          ; yend
    mov           r14d, yd
    and           r14d, ~7                  ; y_sb_align
    cmp           r11d, r14d
    cmovs         r11d, r14d                ; imax(y_sb_align, ystart)
    mov     [rsp+0x44], r11d                ; y_proj_start
    add           r14d, 8
    cmp           r12d, r14d
    cmovs         r14d, r12d                ; imin(y_sb_align + 8, yend)
    mov     [rsp+0x3c], r14d                ; y_proj_end
 DEFINE_ARGS y, src, xstart, xend, frac, rf, n7, mv, \
             ref, x, xendi, mvx, mvy, rb, ref2ref
    mov             xd, [rsp+0x20] ; xstarti
.xloop:
    lea            rbd, [xq*5]
    add            rbq, srcq
    movsx         refd, byte [rbq+4]
    test          refd, refd
    jz .next_x_bad_ref
    mov            rfq, [rsp+0x48]
    lea           r14d, [16+n7q+refq]
    mov       ref2refd, [rfq+r14*4]         ; rf->mfmv_ref2ref[n][b_ref-1]
    test      ref2refd, ref2refd
    jz .next_x_bad_ref
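    ; Project the candidate mv onto the current frame. In scalar terms
    ; the vector code below computes, per component:
    ;     frac   = mv_proj[ref2ref] * ref2cur
    ;     xy     = mv * frac
    ;     offset = (xy + (xy >> 31) + 8192) >> 14
    ;     pos    = apply_sign(abs(offset) >> 6, offset ^ ref_sign)
    ; where the final >> 6 converts from 1/8-pel to 8-pixel block
    ; units.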
    lea          fracq, [mv_proj]
    movzx        fracd, word [fracq+ref2refq*2]
    mov            mvd, [rbq]
    imul         fracd, [rsp+0x40] ; ref2cur
    pmovsxwq        m0, [rbq]
    movd            m1, fracd
    punpcklqdq      m1, m1
    pmuldq          m0, m1          ; mv * frac
    pshufd          m1, m0, q3311
    paddd           m0, m3
    paddd           m0, m1
    psrad           m0, 14          ; offset = (xy + (xy >> 31) + 8192) >> 14
    pabsd           m1, m0
    packssdw        m0, m0
    psrld           m1, 6
    packuswb        m1, m1
    pxor            m0, m2          ; offset ^ ref_sign
    psignd          m1, m0          ; apply_sign(abs(offset) >> 6, offset ^ refsign)
    movq          mvxq, m1
    lea           mvyd, [mvxq+yq]   ; ypos
    sar           mvxq, 32
 DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, \
             ref, x, xendi, mvx, ypos, rb, ref2ref
    cmp          yposd, [rsp+0x44] ; y_proj_start
    jl .next_x_bad_pos_y
    cmp          yposd, [rsp+0x3c] ; y_proj_end
    jge .next_x_bad_pos_y
    and          yposd, 15
    add           mvxq, xq          ; xpos
    imul         yposq, [rsp+0x30]  ; pos = (ypos & 15) * stride
 DEFINE_ARGS y, src, xstart, xend, dst, _, n7, mv, \
             ref, x, xendi, xpos, pos, rb, ref2ref
    mov           dstq, [rsp+0x28]  ; dst = rp_proj + tile_row_offset
    add           posq, xposq       ; pos += xpos
    lea           posq, [posq*5]
    add           dstq, posq        ; dst += pos5
    jmp .write_loop_entry
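    ; The write loop below extends the current projection across
    ; consecutive candidates with an identical {mv, ref} pair, and
    ; clamps each written xpos to [x_proj_start, x_proj_end), the
    ; horizontal window derived from the 8-aligned x in the same way
    ; .yloop derived y_proj_start/y_proj_end.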
.write_loop:
    add            rbq, 5
    cmp           refb, byte [rbq+4]
    jne .xloop
    cmp            mvd, [rbq]
    jne .xloop
    add           dstq, 5
    inc          xposd
.write_loop_entry:
    mov           r12d, xd
    and           r12d, ~7
    lea            r5d, [r12-8]
    cmp            r5d, xstartd
    cmovs          r5d, xstartd     ; x_proj_start
    cmp          xposd, r5d
    jl .next_xpos
    add           r12d, 16
    cmp          xendd, r12d
    cmovs         r12d, xendd       ; x_proj_end
    cmp          xposd, r12d
    jge .next_xpos
    mov       [dstq+0], mvd
    mov  byte [dstq+4], ref2refb
.next_xpos:
    inc             xd
    cmp             xd, xendid
    jl .write_loop
.next_y:
 DEFINE_ARGS y, src, xstart, xend, ystart, _, n7, _, _, x, xendi, _, _, _, n
    add           srcq, [rsp+0x18] ; stride5
    inc             yd
    cmp             yd, [rsp+0x38] ; yend
    jne .yloop
    mov             nd, [rsp+0x14]
    mov        ystartd, [rsp+0x24]
.next_n:
    add            n7d, 7
    inc             nd
    cmp             nd, [rsp+0x0c] ; n_mfmvs
    jne .nloop
.ret:
    RET
.next_x:
 DEFINE_ARGS y, src, xstart, xend, _, _, n7, mv, ref, x, xendi, _, _, rb, _
    add            rbq, 5
    cmp           refb, byte [rbq+4]
    jne .xloop
    cmp            mvd, [rbq]
    jne .xloop
.next_x_bad_pos_y:
    inc             xd
    cmp             xd, xendid
    jl .next_x
    jmp .next_y
.next_x_bad_ref:
    inc             xd
    cmp             xd, xendid
    jl .xloop
    jmp .next_y
INIT_YMM avx2
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 13, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r12-.write1
    lea            r12, [.write1]
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    vpbroadcastq    m4, [ref_signq]
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    vpbroadcastq    m5, [base+save_cond0]
    vpbroadcastq    m6, [base+save_cond1]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    mova            m9, [base+save_pack1]
    psllq           m4, 8
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add        ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul         candq, xq, 0x9999
    sar          candq, 16                   ; x / 5 * 3
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu           xm0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_avx2_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_avx2_table+r10*2+1]
    add            r10, r12
    add          candq, r11
    jge .calc
    vinserti128     m0, [bq+candq*8+12], 1
    movzx         r11d, byte [bq+candq*8+22]
    movzx         r11d, byte [base+save_tmvs_avx2_table+r11*2+1]
    add            r11, r12
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1  ; ref > 0 && ref_sign[ref - 1]
    psrlw           m2, 12      ; (abs(mv.x) | abs(mv.y)) < 4096
    pcmpgtd         m1, m2
    pshufd          m2, m1, q2301
    pand            m1, m5      ; b0.cond0 b1.cond0
    pand            m2, m6      ; b0.cond1 b1.cond1
    por             m1, m2      ; b0.shuf b1.shuf
    pxor            m1, m7      ; if cond0|cond1 == 0 => zero out
    pshufb          m0, m1
    call           r10
    jge .next_line
    vextracti128   xm0, m0, 1
    call           r11
    jl .loop_x
.next_line:
    add            rpq, strideq
    dec             hd
    jg .loop_y
    RET
.write1:
    movd   [rpq+xq+ 0], xm0
    pextrb [rpq+xq+ 4], xm0, 4
    add             xq, 5*1
    ret
.write2:
    movq    [rpq+xq+0], xm0
    psrlq          xm1, xm0, 8
    movd    [rpq+xq+6], xm1
    add             xq, 5*2
    ret
.write4:
    pshufb         xm1, xm0, xm8
    movu   [rpq+xq+ 0], xm1
    psrlq          xm1, 8
    movd   [rpq+xq+16], xm1
    add             xq, 5*4
    ret
.write8:
    vinserti128     m1, m0, xm0, 1
    pshufb          m1, m8
    movu   [rpq+xq+ 0], m1
    psrldq         xm1, 2
    movq   [rpq+xq+32], xm1
    add             xq, 5*8
    ret
.write16:
    vinserti128     m1, m0, xm0, 1
    pshufb          m2, m1, m8
    movu   [rpq+xq+ 0], m2
    pshufb          m1, m9
    movu   [rpq+xq+32], m1
    shufps         xm2, xm1, q1021
    movu   [rpq+xq+64], xm2
    add             xq, 5*16
    ret

cglobal splat_mv, 4, 5, 3, rr, a, bx4, bw4, bh4
    add           bx4d, bw4d
    tzcnt         bw4d, bw4d
    vbroadcasti128  m0, [aq]
    lea             aq, [splat_mv_avx2_table]
    lea           bx4q, [bx4q*3-32]
    movsxd        bw4q, [aq+bw4q*4]
    pshufb          m0, [splat_mv_shuf]
    movifnidn     bh4d, bh4m
    pshufd          m1, m0, q2102
    pshufd          m2, m0, q1021
    add           bw4q, aq
.loop:
    mov             aq, [rrq]
    add            rrq, gprsize
    lea             aq, [aq+bx4q*4]
    jmp           bw4q
.w32:
    mova     [aq-32*8], m0
    mova     [aq-32*7], m1
    mova     [aq-32*6], m2
    mova     [aq-32*5], m0
    mova     [aq-32*4], m1
    mova     [aq-32*3], m2
.w16:
    mova     [aq-32*2], m0
    mova     [aq-32*1], m1
    mova     [aq+32*0], m2
.w8:
    mova     [aq+32*1], m0
    mova     [aq+32*2], m1
    mova     [aq+32*3], m2
    dec           bh4d
    jg .loop
    RET
.w4:
    movu      [aq+ 80], m0
    mova      [aq+112], xm1
    dec           bh4d
    jg .loop
    RET
.w2:
    movu      [aq+104], xm0
    movq      [aq+120], xm2
    dec           bh4d
    jg .loop
    RET
.w1:
    movq      [aq+116], xm0
    movd      [aq+124], xm1
    dec           bh4d
    jg .loop
    RET

INIT_ZMM avx512icl
; refmvs_temporal_block *rp, ptrdiff_t stride,
; refmvs_block **rr, uint8_t *ref_sign,
; int col_end8, int row_end8, int col_start8, int row_start8
cglobal save_tmvs, 4, 15, 10, rp, stride, rr, ref_sign, \
                              xend, yend, xstart, ystart
%define base r14-.write1
    lea            r14, [.write1]
    movifnidn    xendd, xendm
    movifnidn    yendd, yendm
    mov        xstartd, xstartm
    mov        ystartd, ystartm
    psllq           m4, [ref_signq]{bcstq}, 8
    vpbroadcastq    m3, [base+save_ref_shuf+8]
    vbroadcasti32x4 m5, [base+cond_shuf512]
    vbroadcasti32x4 m6, [base+save_cond0]
    vpbroadcastd    m7, [base+pb_128]
    mova            m8, [base+save_pack0]
    movu           xm9, [base+save_pack0+4]
    lea            r9d, [xendq*5]
    lea        xstartd, [xstartq*5]
    sub          yendd, ystartd
    add        ystartd, ystartd
    lea        strideq, [strideq*5]
    sub        xstartq, r9
    add          xendd, r9d
    add            rpq, r9
    mov           r10d, 0x1f
    kmovb           k2, r10d
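; k2 = 0x1f keeps five elements per masked store, i.e. one 5-byte
; temporal block per element size: .write1 writes 5 bytes, .write2 five
; words, .write4 five dwords and .write8 five qwords (10/20/40 bytes).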
 DEFINE_ARGS rp, stride, rr, x, xend, h, xstart, ystart, b, cand
.loop_y:
    and        ystartd, 30
    mov             xq, xstartq
    mov             bq, [rrq+ystartq*8]
    add        ystartd, 2
    lea             bq, [bq+xendq*4]
.loop_x:
    imul         candq, xq, 0x9999
    sar          candq, 16                   ; x / 5 * 3
    movzx         r10d, byte [bq+candq*8+22] ; cand_b->bs
    movu           xm0, [bq+candq*8+12]      ; cand_b
    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r10*2+0]
    movzx         r10d, byte [base+save_tmvs_avx512icl_table+r10*2+1]
    add            r10, r14
    add          candq, r11
    jge .calc
    movzx         r11d, byte [bq+candq*8+22]
    vinserti32x4   ym0, [bq+candq*8+12], 1
    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r11*2+0]
    movzx         r11d, byte [base+save_tmvs_avx512icl_table+r11*2+1]
    add            r11, r14
    add          candq, r12
    jge .calc
    movzx         r12d, byte [bq+candq*8+22]
    vinserti32x4    m0, [bq+candq*8+12], 2
    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r12*2+0]
    movzx         r12d, byte [base+save_tmvs_avx512icl_table+r12*2+1]
    add            r12, r14
    add          candq, r13
    jge .calc
    vinserti32x4    m0, [bq+candq*8+12], 3
    movzx         r13d, byte [bq+candq*8+22]
    movzx         r13d, byte [base+save_tmvs_avx512icl_table+r13*2+1]
    add            r13, r14
.calc:
    pshufb          m1, m0, m3
    pabsw           m2, m0
    pshufb          m1, m4, m1      ; ref > 0 && ref_sign[ref - 1]
    psrlw           m2, 12          ; (abs(mv.x) | abs(mv.y)) < 4096
    psubd           m2, m1
    pshufb          m2, m5           ; c0 c1 c1 c0
    pand            m2, m6
    punpckhqdq      m1, m2, m2
    vpternlogd      m1, m2, m7, 0x56 ; (c0shuf | c1shuf) ^ 0x80
    pshufb          m2, m0, m1
    mova           xm0, xm2
    call           r10
    jge .next_line
    vextracti32x4  xm0, m2, 1
    call           r11
    jge .next_line
    vextracti32x4  xm0, m2, 2
    call           r12
    jge .next_line
    vextracti32x4  xm0, m2, 3
    call           r13
    jl .loop_x
.next_line:
    add            rpq, strideq
    dec             hd
    jg .loop_y
    RET
.write1:
    vmovdqu8 [rpq+xq]{k2}, xm0
    add             xq, 5*1
    ret
.write2:
    pshufb         xm0, xm8
    vmovdqu16 [rpq+xq]{k2}, xm0
    add             xq, 5*2
    ret
.write4:
    vpermb         ym0, ym8, ym0
    vmovdqu32 [rpq+xq]{k2}, ym0
    add             xq, 5*4
    ret
.write8:
    vpermb          m0, m8, m0
    vmovdqu64 [rpq+xq]{k2}, m0
    add             xq, 5*8
    ret
.write16:
    vpermb          m1, m8, m0
    movu   [rpq+xq+ 0], m1
    pshufb         xm0, xm9
    movu   [rpq+xq+64], xm0
    add             xq, 5*16
    ret

INIT_ZMM avx512icl
cglobal splat_mv, 4, 7, 3, rr, a, bx4, bw4, bh4
    vbroadcasti32x4    m0, [aq]
    lea                r1, [splat_mv_avx512icl_table]
    tzcnt            bw4d, bw4d
    lea              bx4d, [bx4q*3]
    pshufb             m0, [splat_mv_shuf]
    movsxd           bw4q, [r1+bw4q*4]
    mov               r6d, bh4m
    add              bw4q, r1
    lea               rrq, [rrq+r6*8]
    mov               r1d, 0x3f
    neg                r6
    kmovb              k1, r1d
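; k1 = 0x3f keeps six elements per masked store, i.e. bw4*12 bytes at
; each granularity: six words (one 12-byte refmvs_block) for .w1, six
; dwords (two blocks) for .w2, six qwords (four blocks) for .w4.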
    jmp              bw4q
.w1:
    mov                r1, [rrq+r6*8]
    vmovdqu16 [r1+bx4q*4]{k1}, xm0
    inc                r6
    jl .w1
    RET
.w2:
    mov                r1, [rrq+r6*8]
    vmovdqu32 [r1+bx4q*4]{k1}, ym0
    inc                r6
    jl .w2
    RET
.w4:
    mov                r1, [rrq+r6*8]
    vmovdqu64 [r1+bx4q*4]{k1}, m0
    inc                r6
    jl .w4
    RET
.w8:
    pshufd            ym1, ym0, q1021
.w8_loop:
    mov                r1, [rrq+r6*8+0]
    mov                r3, [rrq+r6*8+8]
    movu   [r1+bx4q*4+ 0], m0
    mova   [r1+bx4q*4+64], ym1
    movu   [r3+bx4q*4+ 0], m0
    mova   [r3+bx4q*4+64], ym1
    add                r6, 2
    jl .w8_loop
    RET
.w16:
    pshufd             m1, m0, q1021
    pshufd             m2, m0, q2102
.w16_loop:
    mov                r1, [rrq+r6*8+0]
    mov                r3, [rrq+r6*8+8]
    mova [r1+bx4q*4+64*0], m0
    mova [r1+bx4q*4+64*1], m1
    mova [r1+bx4q*4+64*2], m2
    mova [r3+bx4q*4+64*0], m0
    mova [r3+bx4q*4+64*1], m1
    mova [r3+bx4q*4+64*2], m2
    add                r6, 2
    jl .w16_loop
    RET
.w32:
    pshufd             m1, m0, q1021
    pshufd             m2, m0, q2102
.w32_loop:
    mov                r1, [rrq+r6*8]
    lea                r1, [r1+bx4q*4]
    mova        [r1+64*0], m0
    mova        [r1+64*1], m1
    mova        [r1+64*2], m2
    mova        [r1+64*3], m0
    mova        [r1+64*4], m1
    mova        [r1+64*5], m2
    inc                r6
    jl .w32_loop
    RET
%endif ; ARCH_X86_64