; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA

; dav1d_obmc_masks[] << 9
obmc_masks:     dw     0,     0,  9728,     0, 12800,  7168,  2560,     0
                dw 14336, 11264,  8192,  5632,  3584,  1536,     0,     0
                dw 15360, 13824, 12288, 10752,  9216,  7680,  6144,  5120
                dw  4096,  3072,  2048,  1536,     0,     0,     0,     0
                dw 15872, 14848, 14336, 13312, 12288, 11776, 10752, 10240
                dw  9728,  8704,  8192,  7168,  6656,  6144,  5632,  4608
                dw  4096,  3584,  3072,  2560,  2048,  2048,  1536,  1024

blend_shuf:     db 0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
spel_h_shufA:   db 0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
spel_h_shufB:   db 4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
spel_h_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
spel_s_shuf2:   db 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7
spel_s_shuf8:   db 0,  1,  8,  9,  2,  3, 10, 11,  4,  5, 12, 13,  6,  7, 14, 15
unpckw:         db 0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
rescale_mul:    dd 0,  1,  2,  3
resize_shuf:    db 0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
                db 8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
bdct_lb_q: times 8 db 0
           times 8 db 4
           times 8 db 8
           times 8 db 12

pw_2:             times 8 dw 2
pw_16:            times 4 dw 16
prep_mul:         times 4 dw 16
                  times 8 dw 4
pw_64:            times 8 dw 64
pw_256:           times 8 dw 256
pw_2048:          times 4 dw 2048
bidir_mul:        times 4 dw 2048
pw_8192:          times 8 dw 8192
pw_27615:         times 8 dw 27615
pw_32766:         times 8 dw 32766
pw_m512:          times 8 dw -512
pd_63:            times 4 dd 63
pd_64:            times 4 dd 64
pd_512:           times 4 dd 512
pd_m524256:       times 4 dd -524256 ; -(8192 << 6) + 32
pd_0x3ff:         times 4 dd 0x3ff
pd_0x4000:        times 4 dd 0x4000
pq_0x400000:      times 2 dq 0x400000
pq_0x40000000:    times 2 dq 0x40000000
pd_65538:         times 2 dd 65538

put_bilin_h_rnd:  times 4 dw 8
                  times 4 dw 10
s_8tap_h_rnd:     times 2 dd 2
                  times 2 dd 8
put_s_8tap_v_rnd: times 2 dd 512
                  times 2 dd 128
s_8tap_h_sh:      dd 2, 4
put_s_8tap_v_sh:  dd 10, 8
bidir_rnd:        times 4 dw -16400
                  times 4 dw -16388
put_8tap_h_rnd:   dd 34, 34, 40, 40
prep_8tap_1d_rnd: times 2 dd     8 - (8192 <<  4)
prep_8tap_2d_rnd: times 4 dd    32 - (8192 <<  5)

warp8x8_shift:    dd 11, 13
warp8x8_rnd1:     dd 1024, 1024, 4096, 4096
warp8x8_rnd2:     times 4 dw 4096
                  times 4 dw 16384
warp8x8t_rnd:     times 2 dd 16384 - (8192 << 15)

%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
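; A sketch of what one invocation expands to (illustrative, not actual
; assembler output): BIDIR_JMP_TABLE avg, ssse3, 4, 8, ... yields
;   avg_ssse3_table = %%table - 2*4
;   %%table: dd mangle(..._avg_16bpc_ssse3).w4 - avg_ssse3_table
;            dd mangle(..._avg_16bpc_ssse3).w8 - avg_ssse3_table
;            ...
; The -2*%3 bias cancels tzcnt(smallest width)*4, so after "tzcnt wd, wd"
; a [table+wq*4] load hits entry 0 for the smallest width.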

BIDIR_JMP_TABLE avg,        ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, ssse3,    4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      ssse3,    4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    ssse3, 2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    ssse3, 2, 4, 8, 16, 32, 64, 128

%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro
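; Same scheme as BIDIR_JMP_TABLE, but with 16-bit (dw) entries measured
; from a label inside the function itself (put_ssse3/prep_ssse3 below
; alias the .put/.prep entry points), and a -%3 bias matching the
; tzcnt(width)*2 scaling of the smallest width.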

%xdefine put_ssse3 mangle(private_prefix %+ _put_bilin_16bpc_ssse3.put)
%xdefine prep_ssse3 mangle(private_prefix %+ _prep_bilin_16bpc_ssse3.prep)

BASE_JMP_TABLE put,  ssse3, 2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep, ssse3,    4, 8, 16, 32, 64, 128

%macro SCALED_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base mangle(private_prefix %+ _%1_16bpc_%2)
%%table:
    %rep %0 - 2
        dw %%base %+ .w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_1024:
    %xdefine %1_%2_dy1_table (%%dy_1024 - %3)
    %rep %0 - 2
        dw %%base %+ .dy1_w%3 - %%base
        %rotate 1
    %endrep
    %rotate 2
%%dy_2048:
    %xdefine %1_%2_dy2_table (%%dy_2048 - %3)
    %rep %0 - 2
        dw %%base %+ .dy2_w%3 - %%base
        %rotate 1
    %endrep
%endmacro
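; Three dw tables per scaled function: the generic path plus dedicated
; %%dy_1024 and %%dy_2048 variants for vertical steps of exactly 1.0
; and 2.0 (dy presumably being in 1/1024-pel units, hence the names),
; each with the same -%3 bias as above.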

SCALED_JMP_TABLE put_8tap_scaled,  ssse3, 2, 4, 8, 16, 32, 64, 128
SCALED_JMP_TABLE prep_8tap_scaled, ssse3,    4, 8, 16, 32, 64, 128

cextern mc_subpel_filters
%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
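; The -8 makes mxq*8 indexing line up with 1-based fractional positions:
; filters are stored for mx = 1..15 (mx == 0 means no filtering and is
; handled by the plain copy paths), so the entry for position mx lives
; at subpel_filters + mx*8.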

cextern mc_warp_filter
cextern resize_filter

SECTION .text

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

INIT_XMM ssse3
cglobal put_bilin_16bpc, 4, 7, 0, dst, ds, src, ss, w, h, mxy
%define base t0-put_ssse3
    mov                mxyd, r6m ; mx
    LEA                  t0, put_ssse3
    movifnidn            wd, wm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r7m ; my
    test               mxyd, mxyd
    jnz .v
.put:
    tzcnt                wd, wd
    movzx                wd, word [base+put_ssse3_table+wq*2]
    add                  wq, t0
    movifnidn            hd, hm
    jmp                  wq
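; Dispatch example: w == 16 gives tzcnt == 4, so wq*2 == 8 combined with
; the table's -2 bias selects the fourth dw entry (widths 2, 4, 8, 16),
; and adding t0 (the .put base) lands the jmp on .put_w16.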
.put_w2:
    mov                 r4d, [srcq+ssq*0]
    mov                 r6d, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mov        [dstq+dsq*0], r4d
    mov        [dstq+dsq*1], r6d
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w2
    RET
.put_w4:
    movq                 m0, [srcq+ssq*0]
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq       [dstq+dsq*0], m0
    movq       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w4
    RET
.put_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w8
    RET
.put_w16:
    movu                 m0, [srcq+ssq*0+16*0]
    movu                 m1, [srcq+ssq*0+16*1]
    movu                 m2, [srcq+ssq*1+16*0]
    movu                 m3, [srcq+ssq*1+16*1]
    lea                srcq, [srcq+ssq*2]
    mova  [dstq+dsq*0+16*0], m0
    mova  [dstq+dsq*0+16*1], m1
    mova  [dstq+dsq*1+16*0], m2
    mova  [dstq+dsq*1+16*1], m3
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .put_w16
    RET
.put_w32:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, ssq
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w32
    RET
.put_w64:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w64
    RET
.put_w128:
    add                srcq, 16*8
    add                dstq, 16*8
.put_w128_loop:
    movu                 m0, [srcq-16*8]
    movu                 m1, [srcq-16*7]
    movu                 m2, [srcq-16*6]
    movu                 m3, [srcq-16*5]
    mova        [dstq-16*8], m0
    mova        [dstq-16*7], m1
    mova        [dstq-16*6], m2
    mova        [dstq-16*5], m3
    movu                 m0, [srcq-16*4]
    movu                 m1, [srcq-16*3]
    movu                 m2, [srcq-16*2]
    movu                 m3, [srcq-16*1]
    mova        [dstq-16*4], m0
    mova        [dstq-16*3], m1
    mova        [dstq-16*2], m2
    mova        [dstq-16*1], m3
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    mova        [dstq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, ssq
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    mova        [dstq+16*6], m2
    mova        [dstq+16*7], m3
    add                dstq, dsq
    dec                  hd
    jg .put_w128_loop
    RET
.h:
    movd                 m5, mxyd
    mov                mxyd, r7m ; my
    mova                 m4, [base+pw_16]
    pshufb               m5, [base+pw_256]
    psubw                m4, m5
    test               mxyd, mxyd
    jnz .hv
    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
    mov                 r6d, r8m ; bitdepth_max
    shr                 r6d, 11
    movddup              m3, [base+put_bilin_h_rnd+r6*8]
    movifnidn            hd, hm
    sub                  wd, 8
    jg .h_w16
    je .h_w8
    cmp                  wd, -4
    je .h_w4
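; Per output pixel: ((16-mx)*px[x] + mx*px[x+1] + rnd) >> 4, where rnd
; comes from put_bilin_h_rnd (8 for 10-bit, 10 for 12-bit).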
.h_w2:
    movq                 m1, [srcq+ssq*0]
    movhps               m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4, m1
    psrlq                m1, 16
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    movd       [dstq+dsq*0], m0
    punpckhqdq           m0, m0
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2
    RET
.h_w4:
    movq                 m0, [srcq+ssq*0]
    movhps               m0, [srcq+ssq*1]
    movq                 m1, [srcq+ssq*0+2]
    movhps               m1, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 4
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w16_loop0:
    mov                  r6, wq
.h_w16_loop:
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    movu                 m1, [srcq+r6*2+16]
    movu                 m2, [srcq+r6*2+18]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m0, 4
    psrlw                m1, 4
    mova   [dstq+r6*2+16*0], m0
    mova   [dstq+r6*2+16*1], m1
    add                  r6, 16
    jl .h_w16_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w16_loop0
    RET
.v:
    shl                mxyd, 11
    movd                 m5, mxyd
    pshufb               m5, [base+pw_256]
    movifnidn            hd, hm
    cmp                  wd, 4
    jg .v_w8
    je .v_w4
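; my is pre-scaled by 2048 (my << 11), so for each pair of rows
; pmulhrsw computes a + (((b-a)*my + 8) >> 4) with rounding, entirely
; in 16-bit arithmetic.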
.v_w2:
    movd                 m0, [srcq+ssq*0]
.v_w2_loop:
    movd                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq           m2, m0, m1
    movd                 m0, [srcq+ssq*0]
    punpcklqdq           m1, m0
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    movd       [dstq+dsq*0], m1
    punpckhqdq           m1, m1
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4:
    movq                 m0, [srcq+ssq*0]
.v_w4_loop:
    movq                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklqdq           m2, m0, m1
    movq                 m0, [srcq+ssq*0]
    punpcklqdq           m1, m0
    psubw                m1, m2
    pmulhrsw             m1, m5
    paddw                m1, m2
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
%if ARCH_X86_64
%if WIN64
    push                 r7
%endif
    shl                  wd, 5
    mov                  r7, srcq
    lea                 r6d, [wq+hq-256]
    mov                  r4, dstq
%else
    mov                  r6, srcq
%endif
.v_w8_loop0:
    movu                 m0, [srcq+ssq*0]
.v_w8_loop:
    movu                 m3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    psubw                m1, m3, m0
    pmulhrsw             m1, m5
    paddw                m1, m0
    movu                 m0, [srcq+ssq*0]
    psubw                m2, m0, m3
    pmulhrsw             m2, m5
    paddw                m2, m3
    mova       [dstq+dsq*0], m1
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
%if ARCH_X86_64
    add                  r7, 16
    add                  r4, 16
    movzx                hd, r6b
    mov                srcq, r7
    mov                dstq, r4
    sub                 r6d, 1<<8
%else
    mov                dstq, dstmp
    add                  r6, 16
    mov                  hd, hm
    add                dstq, 16
    mov                srcq, r6
    mov               dstmp, dstq
    sub                  wd, 8
%endif
    jg .v_w8_loop0
%if WIN64
    pop                 r7
%endif
    RET
.hv:
    WIN64_SPILL_XMM       8
    shl                mxyd, 11
    mova                 m3, [base+pw_2]
    movd                 m6, mxyd
    mova                 m7, [base+pw_8192]
    pshufb               m6, [base+pw_256]
    test          dword r8m, 0x800
    jnz .hv_12bpc
    psllw                m4, 2
    psllw                m5, 2
    mova                 m7, [base+pw_2048]
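; bitdepth_max is 0x3ff for 10-bit and 0xfff for 12-bit, so bit 11
; (0x800) of r8m tells them apart. The horizontal stage keeps an
; intermediate of ~16*px (10-bit, weights pre-scaled by 4) or ~4*px
; (12-bit); the final pmulhrsw by pw_2048 or pw_8192 rounds that back
; down (>>4 resp. >>2).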
.hv_12bpc:
    movifnidn            hd, hm
    cmp                  wd, 4
    jg .hv_w8
    je .hv_w4
.hv_w2:
    movddup              m0, [srcq+ssq*0]
    pshufhw              m1, m0, q0321
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w2_loop:
    movq                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movhps               m2, [srcq+ssq*0]
    pmullw               m1, m4, m2
    psrlq                m2, 16
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; 1 _ 2 _
    shufpd               m2, m0, m1, 0x01 ; 0 _ 1 _
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7
    movd       [dstq+dsq*0], m1
    punpckhqdq           m1, m1
    movd       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4:
    movddup              m0, [srcq+ssq*0]
    movddup              m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w4_loop:
    movq                 m1, [srcq+ssq*1]
    movq                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    movhps               m1, [srcq+ssq*0]
    movhps               m2, [srcq+ssq*0+2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2            ; 1 2
    shufpd               m2, m0, m1, 0x01 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    paddw                m1, m1
    pmulhw               m1, m6
    paddw                m1, m2
    pmulhrsw             m1, m7
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
%if ARCH_X86_64
%if WIN64
    push                 r7
%endif
    shl                  wd, 5
    lea                 r6d, [wq+hq-256]
    mov                  r4, srcq
    mov                  r7, dstq
%else
    mov                  r6, srcq
%endif
.hv_w8_loop0:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m1, m5
    paddw                m0, m3
    paddw                m0, m1
    psrlw                m0, 2
.hv_w8_loop:
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*1+2]
    lea                srcq, [srcq+ssq*2]
    pmullw               m1, m4
    pmullw               m2, m5
    paddw                m1, m3
    paddw                m1, m2
    psrlw                m1, 2
    psubw                m2, m1, m0
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m0
    pmulhrsw             m2, m7
    mova       [dstq+dsq*0], m2
    movu                 m0, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*0+2]
    pmullw               m0, m4
    pmullw               m2, m5
    paddw                m0, m3
    paddw                m0, m2
    psrlw                m0, 2
    psubw                m2, m0, m1
    paddw                m2, m2
    pmulhw               m2, m6
    paddw                m2, m1
    pmulhrsw             m2, m7
    mova       [dstq+dsq*1], m2
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
%if ARCH_X86_64
    add                  r4, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r4
    mov                dstq, r7
    sub                 r6d, 1<<8
%else
    mov                dstq, dstmp
    add                  r6, 16
    mov                  hd, hm
    add                dstq, 16
    mov                srcq, r6
    mov               dstmp, dstq
    sub                  wd, 8
%endif
    jg .hv_w8_loop0
%if WIN64
    pop                  r7
%endif
    RET

cglobal prep_bilin_16bpc, 4, 7, 0, tmp, src, stride, w, h, mxy, stride3
%define base r6-prep_ssse3
    movifnidn          mxyd, r5m ; mx
    LEA                  r6, prep_ssse3
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
    tzcnt                wd, wd
    movzx                wd, word [base+prep_ssse3_table+wq*2]
    mov                 r5d, r7m ; bitdepth_max
    mova                 m5, [base+pw_8192]
    add                  wq, r6
    shr                 r5d, 11
    movddup              m4, [base+prep_mul+r5*8]
    lea            stride3q, [strideq*3]
    jmp                  wq
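; prep_* stores the signed intermediate used by the bidir/avg kernels:
; px*16 - 8192 for 10-bit or px*4 - 8192 for 12-bit (prep_mul selects
; the multiplier, pw_8192 supplies the bias).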
.prep_w4:
    movq                 m0, [srcq+strideq*0]
    movhps               m0, [srcq+strideq*1]
    movq                 m1, [srcq+strideq*2]
    movhps               m1, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m4
    psubw                m0, m5
    psubw                m1, m5
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 4
    jg .prep_w4
    RET
.prep_w8:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*2]
    movu                 m3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    movu                 m0, [srcq+strideq*0+16*0]
    movu                 m1, [srcq+strideq*0+16*1]
    movu                 m2, [srcq+strideq*1+16*0]
    movu                 m3, [srcq+strideq*1+16*1]
    lea                srcq, [srcq+strideq*2]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    sub                  hd, 2
    jg .prep_w16
    RET
.prep_w32:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    add                tmpq, 16*4
    dec                  hd
    jg .prep_w32
    RET
.prep_w64:
    movu                 m0, [srcq+16*0]
    movu                 m1, [srcq+16*1]
    movu                 m2, [srcq+16*2]
    movu                 m3, [srcq+16*3]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    movu                 m0, [srcq+16*4]
    movu                 m1, [srcq+16*5]
    movu                 m2, [srcq+16*6]
    movu                 m3, [srcq+16*7]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*4], m0
    mova        [tmpq+16*5], m1
    mova        [tmpq+16*6], m2
    mova        [tmpq+16*7], m3
    add                tmpq, 16*8
    dec                  hd
    jg .prep_w64
    RET
.prep_w128:
    movu                 m0, [srcq+16* 0]
    movu                 m1, [srcq+16* 1]
    movu                 m2, [srcq+16* 2]
    movu                 m3, [srcq+16* 3]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    mova        [tmpq+16*2], m2
    mova        [tmpq+16*3], m3
    movu                 m0, [srcq+16* 4]
    movu                 m1, [srcq+16* 5]
    movu                 m2, [srcq+16* 6]
    movu                 m3, [srcq+16* 7]
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq+16*4], m0
    mova        [tmpq+16*5], m1
    mova        [tmpq+16*6], m2
    mova        [tmpq+16*7], m3
    movu                 m0, [srcq+16* 8]
    movu                 m1, [srcq+16* 9]
    movu                 m2, [srcq+16*10]
    movu                 m3, [srcq+16*11]
    add                tmpq, 16*16
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq-16*8], m0
    mova        [tmpq-16*7], m1
    mova        [tmpq-16*6], m2
    mova        [tmpq-16*5], m3
    movu                 m0, [srcq+16*12]
    movu                 m1, [srcq+16*13]
    movu                 m2, [srcq+16*14]
    movu                 m3, [srcq+16*15]
    add                srcq, strideq
    REPX     {pmullw x, m4}, m0, m1, m2, m3
    REPX     {psubw  x, m5}, m0, m1, m2, m3
    mova        [tmpq-16*4], m0
    mova        [tmpq-16*3], m1
    mova        [tmpq-16*2], m2
    mova        [tmpq-16*1], m3
    dec                  hd
    jg .prep_w128
    RET
.h:
    movd                 m4, mxyd
    mov                mxyd, r6m ; my
    mova                 m3, [base+pw_16]
    pshufb               m4, [base+pw_256]
    mova                 m5, [base+pw_32766]
    psubw                m3, m4
    test          dword r7m, 0x800
    jnz .h_12bpc
    psllw                m3, 2
    psllw                m4, 2
.h_12bpc:
    test               mxyd, mxyd
    jnz .hv
    sub                  wd, 8
    je .h_w8
    jg .h_w16
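; Horizontal prep: (16-mx)*px[x] + mx*px[x+1] (weights pre-scaled by 4
; for 10-bit), with pw_32766 = (8192 << 2) - 2 folding the output bias
; and rounding into a single psubw ahead of the arithmetic >>2.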
.h_w4:
    movq                 m0, [srcq+strideq*0]
    movhps               m0, [srcq+strideq*1]
    movq                 m1, [srcq+strideq*0+2]
    movhps               m1, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 16
    sub                  hd, 2
    jg .h_w4
    RET
.h_w8:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 2
    jg .h_w8
    RET
.h_w16:
    lea                srcq, [srcq+wq*2]
    neg                  wq
.h_w16_loop0:
    mov                  r6, wq
.h_w16_loop:
    movu                 m0, [srcq+r6*2+ 0]
    movu                 m1, [srcq+r6*2+ 2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    movu                 m1, [srcq+r6*2+16]
    movu                 m2, [srcq+r6*2+18]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+16*0], m0
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    add                  r6, 16
    jl .h_w16_loop
    add                srcq, strideq
    dec                  hd
    jg .h_w16_loop0
    RET
.v:
    movd                 m4, mxyd
    mova                 m3, [base+pw_16]
    pshufb               m4, [base+pw_256]
    mova                 m5, [base+pw_32766]
    psubw                m3, m4
    test          dword r7m, 0x800
    jnz .v_12bpc
    psllw                m3, 2
    psllw                m4, 2
.v_12bpc:
    cmp                  wd, 8
    je .v_w8
    jg .v_w16
.v_w4:
    movq                 m0, [srcq+strideq*0]
.v_w4_loop:
    movq                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    punpcklqdq           m1, m0, m2 ; 0 1
    movq                 m0, [srcq+strideq*0]
    punpcklqdq           m2, m0     ; 1 2
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2
    mova             [tmpq], m1
    add                tmpq, 16
    sub                  hd, 2
    jg .v_w4_loop
    RET
.v_w8:
    movu                 m0, [srcq+strideq*0]
.v_w8_loop:
    movu                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4, m2
    psubw                m0, m5
    paddw                m1, m0
    movu                 m0, [srcq+strideq*0]
    psraw                m1, 2
    pmullw               m2, m3
    mova        [tmpq+16*0], m1
    pmullw               m1, m4, m0
    psubw                m2, m5
    paddw                m1, m2
    psraw                m1, 2
    mova        [tmpq+16*1], m1
    add                tmpq, 16*2
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16:
%if WIN64
    push                 r7
%endif
    mov                  r5, srcq
%if ARCH_X86_64
    lea                 r6d, [wq*4-32]
    mov                  wd, wd
    lea                 r6d, [hq+r6*8]
    mov                  r7, tmpq
%else
    mov                 r6d, wd
%endif
.v_w16_loop0:
    movu                 m0, [srcq+strideq*0]
.v_w16_loop:
    movu                 m2, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m3
    pmullw               m1, m4, m2
    psubw                m0, m5
    paddw                m1, m0
    movu                 m0, [srcq+strideq*0]
    psraw                m1, 2
    pmullw               m2, m3
    mova        [tmpq+wq*0], m1
    pmullw               m1, m4, m0
    psubw                m2, m5
    paddw                m1, m2
    psraw                m1, 2
    mova        [tmpq+wq*2], m1
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .v_w16_loop
%if ARCH_X86_64
    add                  r5, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r6d, 1<<8
%else
    mov                tmpq, tmpmp
    add                  r5, 16
    mov                  hd, hm
    add                tmpq, 16
    mov                srcq, r5
    mov               tmpmp, tmpq
    sub                 r6d, 8
%endif
    jg .v_w16_loop0
%if WIN64
    pop                  r7
%endif
    RET
.hv:
    WIN64_SPILL_XMM       7
    shl                mxyd, 11
    movd                 m6, mxyd
    pshufb               m6, [base+pw_256]
    cmp                  wd, 8
    je .hv_w8
    jg .hv_w16
.hv_w4:
    movddup              m0, [srcq+strideq*0]
    movddup              m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
.hv_w4_loop:
    movq                 m1, [srcq+strideq*1]
    movq                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    movhps               m1, [srcq+strideq*0]
    movhps               m2, [srcq+strideq*0+2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2            ; 1 2
    shufpd               m2, m0, m1, 0x01 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m6
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 16
    sub                  hd, 2
    jg .hv_w4_loop
    RET
.hv_w8:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
.hv_w8_loop:
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2
    psubw                m2, m1, m0
    pmulhrsw             m2, m6
    paddw                m2, m0
    mova        [tmpq+16*0], m2
    movu                 m0, [srcq+strideq*0]
    movu                 m2, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m2, m4
    psubw                m0, m5
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+16*1], m2
    add                tmpq, 16*2
    sub                  hd, 2
    jg .hv_w8_loop
    RET
.hv_w16:
%if WIN64
    push                 r7
%endif
    mov                  r5, srcq
%if ARCH_X86_64
    lea                 r6d, [wq*4-32]
    mov                  wd, wd
    lea                 r6d, [hq+r6*8]
    mov                  r7, tmpq
%else
    mov                 r6d, wd
%endif
.hv_w16_loop0:
    movu                 m0, [srcq+strideq*0]
    movu                 m1, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m1, m4
    psubw                m0, m5
    paddw                m0, m1
    psraw                m0, 2
.hv_w16_loop:
    movu                 m1, [srcq+strideq*1]
    movu                 m2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m3
    pmullw               m2, m4
    psubw                m1, m5
    paddw                m1, m2
    psraw                m1, 2
    psubw                m2, m1, m0
    pmulhrsw             m2, m6
    paddw                m2, m0
    mova        [tmpq+wq*0], m2
    movu                 m0, [srcq+strideq*0]
    movu                 m2, [srcq+strideq*0+2]
    pmullw               m0, m3
    pmullw               m2, m4
    psubw                m0, m5
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m1
    pmulhrsw             m2, m6
    paddw                m2, m1
    mova        [tmpq+wq*2], m2
    lea                tmpq, [tmpq+wq*4]
    sub                  hd, 2
    jg .hv_w16_loop
%if ARCH_X86_64
    add                  r5, 16
    add                  r7, 16
    movzx                hd, r6b
    mov                srcq, r5
    mov                tmpq, r7
    sub                 r6d, 1<<8
%else
    mov                tmpq, tmpmp
    add                  r5, 16
    mov                  hd, hm
    add                tmpq, 16
    mov                srcq, r5
    mov               tmpmp, tmpq
    sub                 r6d, 8
%endif
    jg .hv_w16_loop0
%if WIN64
    pop                  r7
%endif
    RET

; int8_t subpel_filters[5][15][8]
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
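; Each constant packs two filter-set offsets in units of 15 entries:
; bits 16+ select the 8-tap row (w > 4) and the low bits the 4-tap row
; (w <= 4). "imul mxd, mxm, 0x010101 / add mxd, t0d" below replicates
; mx into bytes 0-2 and adds both offsets at once, so mxb gives the
; 4-tap index and mxd >> 16 the 8-tap one.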

%macro FN 4 ; prefix, type, type_h, type_v
cglobal %1_%2_16bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d
%else
    mov                 t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%1_16bpc %+ SUFFIX)
%endif
%endmacro

%if ARCH_X86_32
DECLARE_REG_TMP 1, 2, 6
%elif WIN64
DECLARE_REG_TMP 4, 5, 8
%else
DECLARE_REG_TMP 7, 8, 8
%endif

%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN sharp,          SHARP,   SHARP
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH
PUT_8TAP_FN regular,        REGULAR, REGULAR

%if ARCH_X86_32
cglobal put_8tap_16bpc, 0, 7, 8, dst, ds, src, ss, w, h, mx, my
%define mxb r0b
%define mxd r0
%define mxq r0
%define myb r1b
%define myd r1
%define myq r1
%define  m8 [esp+16*0]
%define  m9 [esp+16*1]
%define m10 [esp+16*2]
%define m11 [esp+16*3]
%define m12 [esp+16*4]
%define m13 [esp+16*5]
%define m14 [esp+16*6]
%define m15 [esp+16*7]
%else
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
%endif
%define base t2-put_ssse3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    LEA                  t2, put_ssse3
    movifnidn            wd, wm
    movifnidn          srcq, srcmp
    movifnidn           ssq, ssmp
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jnz .v
    tzcnt                wd, wd
    movzx                wd, word [base+put_ssse3_table+wq*2]
    movifnidn          dstq, dstmp
    movifnidn           dsq, dsmp
    add                  wq, t2
%if WIN64
    pop                  r8
    pop                  r7
%endif
    jmp                  wq
.h:
    test                myd, 0xf00
    jnz .hv
    mov                 myd, r8m
    movd                 m5, r8m
    shr                 myd, 11
    movddup              m4, [base+put_8tap_h_rnd+myq*8]
    movifnidn           dsq, dsmp
    pshufb               m5, [base+pw_256]
    cmp                  wd, 4
    jg .h_w8
    movzx               mxd, mxb
    lea                srcq, [srcq-2]
    movq                 m3, [base+subpel_filters+mxq*8]
    movifnidn          dstq, dstmp
    punpcklbw            m3, m3
    psraw                m3, 8 ; sign-extend
    je .h_w4
.h_w2:
    mova                 m2, [base+spel_h_shuf2]
    pshufd               m3, m3, q2121
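; Only the middle four taps should matter here (the 4-tap filter rows
; store zeros in the outer taps), so q2121 broadcasts taps 2-5 and
; spel_h_shuf2 lays out both pixel phases for one pmaddwd per row.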
.h_w2_loop:
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m2
    pshufb               m1, m2
    pmaddwd              m0, m3
    pmaddwd              m1, m3
    phaddd               m0, m1
    paddd                m0, m4
    psrad                m0, 6
    packssdw             m0, m0
    pxor                 m1, m1
    pminsw               m0, m5
    pmaxsw               m0, m1
    movd       [dstq+dsq*0], m0
    pshuflw              m0, m0, q3232
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4:
    WIN64_SPILL_XMM       8
    mova                 m6, [base+spel_h_shufA]
    mova                 m7, [base+spel_h_shufB]
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q2222
.h_w4_loop:
    movu                 m1, [srcq]
    add                srcq, ssq
    pshufb               m0, m1, m6 ; 0 1 1 2 2 3 3 4
    pshufb               m1, m7     ; 2 3 3 4 4 5 5 6
    pmaddwd              m0, m2
    pmaddwd              m1, m3
    paddd                m0, m4
    paddd                m0, m1
    psrad                m0, 6
    packssdw             m0, m0
    pxor                 m1, m1
    pminsw               m0, m5
    pmaxsw               m0, m1
    movq             [dstq], m0
    add                dstq, dsq
    dec                  hd
    jg .h_w4_loop
    RET
.h_w8:
    WIN64_SPILL_XMM      12
    shr                 mxd, 16
    movq                 m3, [base+subpel_filters+mxq*8]
    movifnidn          dstq, dstmp
    mova                 m6, [base+spel_h_shufA]
    mova                 m7, [base+spel_h_shufB]
%if UNIX64
    mov                  wd, wd
%endif
    lea                srcq, [srcq+wq*2]
    punpcklbw            m3, m3
    lea                dstq, [dstq+wq*2]
    psraw                m3, 8
    neg                  wq
%if ARCH_X86_32
    ALLOC_STACK       -16*4
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova                 m8, m0
    mova                 m9, m1
    mova                m10, m2
    mova                m11, m3
%else
    pshufd               m8, m3, q0000
    pshufd               m9, m3, q1111
    pshufd              m10, m3, q2222
    pshufd              m11, m3, q3333
%endif
.h_w8_loop0:
    mov                  r6, wq
.h_w8_loop:
    movu                 m0, [srcq+r6*2- 6]
    movu                 m1, [srcq+r6*2+ 2]
    pshufb               m2, m0, m6   ; 0 1 1 2 2 3 3 4
    pshufb               m0, m7       ; 2 3 3 4 4 5 5 6
    pmaddwd              m2, m8       ; abcd0
    pmaddwd              m0, m9       ; abcd1
    pshufb               m3, m1, m6   ; 4 5 5 6 6 7 7 8
    pshufb               m1, m7       ; 6 7 7 8 8 9 9 a
    paddd                m2, m4
    paddd                m0, m2
    pmaddwd              m2, m10, m3  ; abcd2
    pmaddwd              m3, m8       ; efgh0
    paddd                m0, m2
    pmaddwd              m2, m11, m1  ; abcd3
    pmaddwd              m1, m9       ; efgh1
    paddd                m0, m2
    movu                 m2, [srcq+r6*2+10]
    paddd                m3, m4
    paddd                m1, m3
    pshufb               m3, m2, m6   ; 8 9 9 a a b b c
    pshufb               m2, m7       ; a b b c c d d e
    pmaddwd              m3, m10      ; efgh2
    pmaddwd              m2, m11      ; efgh3
    paddd                m1, m3
    paddd                m1, m2
    psrad                m0, 6
    psrad                m1, 6
    packssdw             m0, m1
    pxor                 m1, m1
    pminsw               m0, m5
    pmaxsw               m0, m1
    mova        [dstq+r6*2], m0
    add                  r6, 8
    jl .h_w8_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w8_loop0
    RET
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovb               myd, mxd
    movq                 m3, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM      15
    movd                 m7, r8m
    movifnidn          dstq, dstmp
    movifnidn           dsq, dsmp
    punpcklbw            m3, m3
    pshufb               m7, [base+pw_256]
    psraw                m3, 8 ; sign-extend
%if ARCH_X86_32
    ALLOC_STACK       -16*7
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova                 m8, m0
    mova                 m9, m1
    mova                m10, m2
    mova                m11, m3
%else
    pshufd               m8, m3, q0000
    pshufd               m9, m3, q1111
    pshufd              m10, m3, q2222
    pshufd              m11, m3, q3333
%endif
    lea                  r6, [ssq*3]
    sub                srcq, r6
    cmp                  wd, 2
    jne .v_w4
.v_w2:
    movd                 m1, [srcq+ssq*0]
    movd                 m4, [srcq+ssq*1]
    movd                 m2, [srcq+ssq*2]
    add                srcq, r6
    movd                 m5, [srcq+ssq*0]
    movd                 m3, [srcq+ssq*1]
    movd                 m6, [srcq+ssq*2]
    add                srcq, r6
    movd                 m0, [srcq+ssq*0]
    punpckldq            m1, m4      ; 0 1
    punpckldq            m4, m2      ; 1 2
    punpckldq            m2, m5      ; 2 3
    punpckldq            m5, m3      ; 3 4
    punpckldq            m3, m6      ; 4 5
    punpckldq            m6, m0      ; 5 6
    punpcklwd            m1, m4      ; 01 12
    punpcklwd            m2, m5      ; 23 34
    punpcklwd            m3, m6      ; 45 56
    pxor                 m6, m6
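; The dword accumulators are reduced with psrad 5, then pavgw against
; zero adds the last rounding bit and halves, i.e. a rounded >> 6
; overall, with pmaxsw/pminsw clamping to [0, bitdepth_max] (m7).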
1430.v_w2_loop:
1431    movd                 m4, [srcq+ssq*1]
1432    lea                srcq, [srcq+ssq*2]
1433    pmaddwd              m5, m8, m1  ; a0 b0
1434    mova                 m1, m2
1435    pmaddwd              m2, m9      ; a1 b1
1436    paddd                m5, m2
1437    mova                 m2, m3
1438    pmaddwd              m3, m10     ; a2 b2
1439    paddd                m5, m3
1440    punpckldq            m3, m0, m4  ; 6 7
1441    movd                 m0, [srcq+ssq*0]
1442    punpckldq            m4, m0      ; 7 8
1443    punpcklwd            m3, m4      ; 67 78
1444    pmaddwd              m4, m11, m3 ; a3 b3
1445    paddd                m5, m4
1446    psrad                m5, 5
1447    packssdw             m5, m5
1448    pmaxsw               m5, m6
1449    pavgw                m5, m6
1450    pminsw               m5, m7
1451    movd       [dstq+dsq*0], m5
1452    pshuflw              m5, m5, q3232
1453    movd       [dstq+dsq*1], m5
1454    lea                dstq, [dstq+dsq*2]
1455    sub                  hd, 2
1456    jg .v_w2_loop
1457    RET
1458.v_w4:
1459%if ARCH_X86_32
1460    shl                  wd, 14
1461%if STACK_ALIGNMENT < 16
1462    mov          [esp+4*29], srcq
1463    mov          [esp+4*30], dstq
1464%else
1465    mov               srcmp, srcq
1466%endif
1467    lea                  wd, [wq+hq-(1<<16)]
1468%else
1469    shl                  wd, 6
1470    mov                  r7, srcq
1471    mov                  r8, dstq
1472    lea                  wd, [wq+hq-(1<<8)]
1473%endif
1474.v_w4_loop0:
1475    movq                 m1, [srcq+ssq*0]
1476    movq                 m2, [srcq+ssq*1]
1477    movq                 m3, [srcq+ssq*2]
1478    add                srcq, r6
1479    movq                 m4, [srcq+ssq*0]
1480    movq                 m5, [srcq+ssq*1]
1481    movq                 m6, [srcq+ssq*2]
1482    add                srcq, r6
1483    movq                 m0, [srcq+ssq*0]
1484    punpcklwd            m1, m2      ; 01
1485    punpcklwd            m2, m3      ; 12
1486    punpcklwd            m3, m4      ; 23
1487    punpcklwd            m4, m5      ; 34
1488    punpcklwd            m5, m6      ; 45
1489    punpcklwd            m6, m0      ; 56
1490%if ARCH_X86_32
1491    jmp .v_w4_loop_start
1492.v_w4_loop:
1493    mova                 m1, m12
1494    mova                 m2, m13
1495    mova                 m3, m14
1496.v_w4_loop_start:
1497    pmaddwd              m1, m8      ; a0
1498    pmaddwd              m2, m8      ; b0
1499    mova                m12, m3
1500    mova                m13, m4
1501    pmaddwd              m3, m9      ; a1
1502    pmaddwd              m4, m9      ; b1
1503    paddd                m1, m3
1504    paddd                m2, m4
1505    mova                m14, m5
1506    mova                 m4, m6
1507    pmaddwd              m5, m10     ; a2
1508    pmaddwd              m6, m10     ; b2
1509    paddd                m1, m5
1510    paddd                m2, m6
1511    movq                 m6, [srcq+ssq*1]
1512    lea                srcq, [srcq+ssq*2]
1513    punpcklwd            m5, m0, m6  ; 67
1514    movq                 m0, [srcq+ssq*0]
1515    pmaddwd              m3, m11, m5 ; a3
1516    punpcklwd            m6, m0      ; 78
1517    paddd                m1, m3
1518    pmaddwd              m3, m11, m6 ; b3
1519    paddd                m2, m3
1520    psrad                m1, 5
1521    psrad                m2, 5
1522    packssdw             m1, m2
1523    pxor                 m2, m2
1524    pmaxsw               m1, m2
1525    pavgw                m1, m2
1526    pminsw               m1, m7
1527    movq       [dstq+dsq*0], m1
1528    movhps     [dstq+dsq*1], m1
1529    lea                dstq, [dstq+dsq*2]
1530    sub                  hd, 2
1531    jg .v_w4_loop
1532%if STACK_ALIGNMENT < 16
1533    mov                srcq, [esp+4*29]
1534    mov                dstq, [esp+4*30]
1535    movzx                hd, ww
1536    add                srcq, 8
1537    add                dstq, 8
1538    mov          [esp+4*29], srcq
1539    mov          [esp+4*30], dstq
1540%else
1541    mov                srcq, srcmp
1542    mov                dstq, dstmp
1543    movzx                hd, ww
1544    add                srcq, 8
1545    add                dstq, 8
1546    mov               srcmp, srcq
1547    mov               dstmp, dstq
1548%endif
1549    sub                  wd, 1<<16
1550%else
1551.v_w4_loop:
1552    pmaddwd             m12, m8, m1  ; a0
1553    pmaddwd             m13, m8, m2  ; b0
1554    mova                 m1, m3
1555    mova                 m2, m4
1556    pmaddwd              m3, m9      ; a1
1557    pmaddwd              m4, m9      ; b1
1558    paddd               m12, m3
1559    paddd               m13, m4
1560    mova                 m3, m5
1561    mova                 m4, m6
1562    pmaddwd              m5, m10     ; a2
1563    pmaddwd              m6, m10     ; b2
1564    paddd               m12, m5
1565    paddd               m13, m6
1566    movq                 m6, [srcq+ssq*1]
1567    lea                srcq, [srcq+ssq*2]
1568    punpcklwd            m5, m0, m6  ; 67
1569    movq                 m0, [srcq+ssq*0]
1570    pmaddwd             m14, m11, m5 ; a3
1571    punpcklwd            m6, m0      ; 78
1572    paddd               m12, m14
1573    pmaddwd             m14, m11, m6 ; b3
1574    paddd               m13, m14
1575    psrad               m12, 5
1576    psrad               m13, 5
1577    packssdw            m12, m13
1578    pxor                m13, m13
1579    pmaxsw              m12, m13
1580    pavgw               m12, m13
1581    pminsw              m12, m7
1582    movq       [dstq+dsq*0], m12
1583    movhps     [dstq+dsq*1], m12
1584    lea                dstq, [dstq+dsq*2]
1585    sub                  hd, 2
1586    jg .v_w4_loop
1587    add                  r7, 8
1588    add                  r8, 8
1589    movzx                hd, wb
1590    mov                srcq, r7
1591    mov                dstq, r8
1592    sub                  wd, 1<<8
1593%endif
1594    jg .v_w4_loop0
1595    RET
1596.hv:
1597    RESET_STACK_STATE
1598%if ARCH_X86_32
1599    movd                 m4, r8m
1600    mova                 m6, [base+pd_512]
1601    pshufb               m4, [base+pw_256]
1602%else
1603%if WIN64
1604    ALLOC_STACK        16*6, 16
1605%endif
1606    movd                m15, r8m
1607    pshufb              m15, [base+pw_256]
1608%endif
1609    cmp                  wd, 4
1610    jg .hv_w8
1611    movzx               mxd, mxb
1612    je .hv_w4
1613    movq                 m0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovb               myd, mxd ; use the 4-tap variant for h < 6
    movq                 m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
    mov                dstq, dstmp
    mov                 dsq, dsmp
    mova                 m5, [base+spel_h_shuf2]
    ALLOC_STACK       -16*8
%else
    mova                 m6, [base+pd_512]
    mova                 m9, [base+spel_h_shuf2]
%endif
    pshuflw              m0, m0, q2121
    pxor                 m7, m7
    punpcklbw            m7, m0
    punpcklbw            m3, m3
    psraw                m3, 8 ; sign-extend
    test          dword r8m, 0x800 ; bit 11 of bitdepth_max is only set for 12 bpc
    jz .hv_w2_10bpc
    psraw                m7, 2
    psllw                m3, 2
.hv_w2_10bpc:
    lea                  r6, [ssq*3]
    sub                srcq, 2
    sub                srcq, r6
%if ARCH_X86_32
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova                 m9, m5
    mova                m11, m0
    mova                m12, m1
    mova                m13, m2
    mova                m14, m3
    mova                m15, m4
%else
    pshufd              m11, m3, q0000
    pshufd              m12, m3, q1111
    pshufd              m13, m3, q2222
    pshufd              m14, m3, q3333
%endif
    movu                 m2, [srcq+ssq*0]
    movu                 m3, [srcq+ssq*1]
    movu                 m1, [srcq+ssq*2]
    add                srcq, r6
    movu                 m4, [srcq+ssq*0]
%if ARCH_X86_32
    REPX    {pshufb  x, m5}, m2, m3, m1, m4
%else
    REPX    {pshufb  x, m9}, m2, m3, m1, m4
%endif
    REPX    {pmaddwd x, m7}, m2, m3, m1, m4
    phaddd               m2, m3        ; 0 1
    phaddd               m1, m4        ; 2 3
    movu                 m3, [srcq+ssq*1]
    movu                 m4, [srcq+ssq*2]
    add                srcq, r6
    movu                 m0, [srcq+ssq*0]
%if ARCH_X86_32
    REPX    {pshufb  x, m5}, m3, m4, m0
%else
    REPX    {pshufb  x, m9}, m3, m4, m0
%endif
    REPX    {pmaddwd x, m7}, m3, m4, m0
    phaddd               m3, m4        ; 4 5
    phaddd               m0, m0        ; 6 6
    REPX    {paddd   x, m6}, m2, m1, m3, m0
    REPX    {psrad   x, 10}, m2, m1, m3, m0
    packssdw             m2, m1        ; 0 1 2 3
    packssdw             m3, m0        ; 4 5 6 _
    palignr              m4, m3, m2, 4 ; 1 2 3 4
    pshufd               m5, m3, q0321 ; 5 6 _ _
    punpcklwd            m1, m2, m4    ; 01 12
    punpckhwd            m2, m4        ; 23 34
    punpcklwd            m3, m5        ; 45 56
.hv_w2_loop:
    movu                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movu                 m5, [srcq+ssq*0]
    pshufb               m4, m9
    pshufb               m5, m9
    pmaddwd              m4, m7
    pmaddwd              m5, m7
    phaddd               m4, m5
    pmaddwd              m5, m11, m1   ; a0 b0
    mova                 m1, m2
    pmaddwd              m2, m12       ; a1 b1
    paddd                m5, m2
    mova                 m2, m3
    pmaddwd              m3, m13       ; a2 b2
    paddd                m5, m3
    paddd                m4, m6
    psrad                m4, 10        ; 7 8
    packssdw             m0, m4
    pshufd               m3, m0, q2103
    punpckhwd            m3, m0        ; 67 78
    mova                 m0, m4
    pmaddwd              m4, m14, m3   ; a3 b3
    paddd                m5, m6
    paddd                m5, m4
    psrad                m5, 10
    packssdw             m5, m5
    pxor                 m4, m4
    pminsw               m5, m15
    pmaxsw               m5, m4
    movd       [dstq+dsq*0], m5
    pshuflw              m5, m5, q3232
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
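
; .hv_w4/.hv_w8 below tile the block into 4-pixel-wide columns. The column
; and row counters travel packed in wd: w is shifted left (by 14 on x86-32,
; 6 on x86-64) before h is added into the low bits, so at the end of each
; column "movzx hd, ww" (or wb) reloads the row count and a single
; "sub wd, 1<<16" (or 1<<8) steps to the next column.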
.hv_w8:
    shr                 mxd, 16
.hv_w4:
    movq                 m2, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovb               myd, mxd
    movq                 m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
    RESET_STACK_STATE
    mov                dstq, dstmp
    mov                 dsq, dsmp
    mova                 m0, [base+spel_h_shufA]
    mova                 m1, [base+spel_h_shufB]
    ALLOC_STACK      -16*15
    mova                 m8, m0
    mova                 m9, m1
    mova                m14, m6
%else
    mova                 m8, [base+spel_h_shufA]
    mova                 m9, [base+spel_h_shufB]
%endif
    pxor                 m0, m0
    punpcklbw            m0, m2
    punpcklbw            m3, m3
    psraw                m3, 8 ; sign-extend
    test          dword r8m, 0x800
    jz .hv_w4_10bpc
    psraw                m0, 2
    psllw                m3, 2
.hv_w4_10bpc:
    lea                  r6, [ssq*3]
    sub                srcq, 6
    sub                srcq, r6
%if ARCH_X86_32
    %define tmp esp+16*8
    shl                  wd, 14
%if STACK_ALIGNMENT < 16
    mov          [esp+4*61], srcq
    mov          [esp+4*62], dstq
%else
    mov               srcmp, srcq
%endif
    mova         [tmp+16*5], m4
    lea                  wd, [wq+hq-(1<<16)]
    pshufd               m1, m0, q0000
    pshufd               m2, m0, q1111
    pshufd               m5, m0, q2222
    pshufd               m0, m0, q3333
    mova                m10, m1
    mova                m11, m2
    mova                m12, m5
    mova                m13, m0
%else
%if WIN64
    %define tmp rsp
%else
    %define tmp rsp-104 ; red zone
%endif
    shl                  wd, 6
    mov                  r7, srcq
    mov                  r8, dstq
    lea                  wd, [wq+hq-(1<<8)]
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
    mova         [tmp+16*5], m15
%endif
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova         [tmp+16*1], m0
    mova         [tmp+16*2], m1
    mova         [tmp+16*3], m2
    mova         [tmp+16*4], m3
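
; PUT_8TAP_HV_H computes four horizontally filtered pixels of one row from
; two overlapping 8-word loads (%1 = src+0, %2 = src+8). The shufA/shufB
; shuffles (m8/m9) lay out the word pairs that pmaddwd expects, so with the
; coefficient pairs replicated in m10-m13 this is, per output pixel x
; (scalar sketch):
;
;   sum = rnd;                           // %5, pd_512 by default
;   for (k = 0; k < 8; k += 2)
;       sum += f[k]*px[x+k] + f[k+1]*px[x+k+1];
;   dst[x] = sum >> shift;               // %4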
%macro PUT_8TAP_HV_H 4-5 m14 ; dst/src+0, src+8, tmp, shift, [pd_512]
    pshufb              m%3, m%1, m8 ; 0 1 1 2 2 3 3 4
    pshufb              m%1, m9      ; 2 3 3 4 4 5 5 6
    pmaddwd             m%3, m10
    pmaddwd             m%1, m11
    paddd               m%3, %5
    paddd               m%1, m%3
    pshufb              m%3, m%2, m8 ; 4 5 5 6 6 7 7 8
    pshufb              m%2, m9      ; 6 7 7 8 8 9 9 a
    pmaddwd             m%3, m12
    pmaddwd             m%2, m13
    paddd               m%1, m%3
    paddd               m%1, m%2
    psrad               m%1, %4
%endmacro
.hv_w4_loop0:
%if ARCH_X86_64
    mova                m14, [pd_512]
%endif
    movu                 m4, [srcq+ssq*0+0]
    movu                 m1, [srcq+ssq*0+8]
    movu                 m5, [srcq+ssq*1+0]
    movu                 m2, [srcq+ssq*1+8]
    movu                 m6, [srcq+ssq*2+0]
    movu                 m3, [srcq+ssq*2+8]
    add                srcq, r6
    PUT_8TAP_HV_H         4, 1, 0, 10
    PUT_8TAP_HV_H         5, 2, 0, 10
    PUT_8TAP_HV_H         6, 3, 0, 10
    movu                 m7, [srcq+ssq*0+0]
    movu                 m2, [srcq+ssq*0+8]
    movu                 m1, [srcq+ssq*1+0]
    movu                 m3, [srcq+ssq*1+8]
    PUT_8TAP_HV_H         7, 2, 0, 10
    PUT_8TAP_HV_H         1, 3, 0, 10
    movu                 m2, [srcq+ssq*2+0]
    movu                 m3, [srcq+ssq*2+8]
    add                srcq, r6
    PUT_8TAP_HV_H         2, 3, 0, 10
    packssdw             m4, m7      ; 0 3
    packssdw             m5, m1      ; 1 4
    movu                 m0, [srcq+ssq*0+0]
    movu                 m1, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         0, 1, 3, 10
    packssdw             m6, m2      ; 2 5
    packssdw             m7, m0      ; 3 6
    punpcklwd            m1, m4, m5  ; 01
    punpckhwd            m4, m5      ; 34
    punpcklwd            m2, m5, m6  ; 12
    punpckhwd            m5, m6      ; 45
    punpcklwd            m3, m6, m7  ; 23
    punpckhwd            m6, m7      ; 56
%if ARCH_X86_32
    jmp .hv_w4_loop_start
.hv_w4_loop:
    mova                 m1, [tmp+16*6]
    mova                 m2, m15
.hv_w4_loop_start:
    mova                 m7, [tmp+16*1]
    pmaddwd              m1, m7      ; a0
    pmaddwd              m2, m7      ; b0
    mova                 m7, [tmp+16*2]
    mova         [tmp+16*6], m3
    pmaddwd              m3, m7      ; a1
    mova                m15, m4
    pmaddwd              m4, m7      ; b1
    mova                 m7, [tmp+16*3]
    paddd                m1, m3
    paddd                m2, m4
    mova                 m3, m5
    pmaddwd              m5, m7      ; a2
    mova                 m4, m6
    pmaddwd              m6, m7      ; b2
    paddd                m1, m5
    paddd                m2, m6
    movu                 m7, [srcq+ssq*1+0]
    movu                 m5, [srcq+ssq*1+8]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H         7, 5, 6, 10
    packssdw             m0, m7      ; 6 7
    mova         [tmp+16*0], m0
    movu                 m0, [srcq+ssq*0+0]
    movu                 m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         0, 5, 6, 10
    mova                 m6, [tmp+16*0]
    packssdw             m7, m0      ; 7 8
    punpcklwd            m5, m6, m7  ; 67
    punpckhwd            m6, m7      ; 78
    pmaddwd              m7, m5, [tmp+16*4]
    paddd                m1, m7      ; a3
    pmaddwd              m7, m6, [tmp+16*4]
    paddd                m2, m7      ; b3
    psrad                m1, 9
    psrad                m2, 9
    packssdw             m1, m2
    pxor                 m7, m7
    pmaxsw               m1, m7
    pavgw                m7, m1
    pminsw               m7, [tmp+16*5]
    movq       [dstq+dsq*0], m7
    movhps     [dstq+dsq*1], m7
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
%if STACK_ALIGNMENT < 16
    mov                srcq, [esp+4*61]
    mov                dstq, [esp+4*62]
    add                srcq, 8
    add                dstq, 8
    mov          [esp+4*61], srcq
    mov          [esp+4*62], dstq
%else
    mov                srcq, srcmp
    mov                dstq, dstmp
    add                srcq, 8
    add                dstq, 8
    mov               srcmp, srcq
    mov               dstmp, dstq
%endif
    movzx                hd, ww
    sub                  wd, 1<<16
%else
.hv_w4_loop:
    mova                m15, [tmp+16*1]
    pmaddwd             m14, m15, m1 ; a0
    pmaddwd             m15, m2      ; b0
    mova                 m7, [tmp+16*2]
    mova                 m1, m3
    pmaddwd              m3, m7      ; a1
    mova                 m2, m4
    pmaddwd              m4, m7      ; b1
    mova                 m7, [tmp+16*3]
    paddd               m14, m3
    paddd               m15, m4
    mova                 m3, m5
    pmaddwd              m5, m7      ; a2
    mova                 m4, m6
    pmaddwd              m6, m7      ; b2
    paddd               m14, m5
    paddd               m15, m6
    movu                 m7, [srcq+ssq*1+0]
    movu                 m5, [srcq+ssq*1+8]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H         7, 5, 6, 10, [pd_512]
    packssdw             m0, m7      ; 6 7
    mova         [tmp+16*0], m0
    movu                 m0, [srcq+ssq*0+0]
    movu                 m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         0, 5, 6, 10, [pd_512]
    mova                 m6, [tmp+16*0]
    packssdw             m7, m0      ; 7 8
    punpcklwd            m5, m6, m7  ; 67
    punpckhwd            m6, m7      ; 78
    pmaddwd              m7, m5, [tmp+16*4]
    paddd               m14, m7      ; a3
    pmaddwd              m7, m6, [tmp+16*4]
    paddd               m15, m7      ; b3
    psrad               m14, 9
    psrad               m15, 9
    packssdw            m14, m15
    pxor                 m7, m7
    pmaxsw              m14, m7
    pavgw                m7, m14
    pminsw               m7, [tmp+16*5]
    movq       [dstq+dsq*0], m7
    movhps     [dstq+dsq*1], m7
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    add                  r7, 8
    add                  r8, 8
    movzx                hd, wb
    mov                srcq, r7
    mov                dstq, r8
    sub                  wd, 1<<8
%endif
    jg .hv_w4_loop0
    RET
%undef tmp

%if ARCH_X86_32
DECLARE_REG_TMP 2, 1, 6, 4
%elif WIN64
DECLARE_REG_TMP 6, 4, 7, 4
%else
DECLARE_REG_TMP 6, 7, 7, 8
%endif

%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN sharp,          SHARP,   SHARP
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH
PREP_8TAP_FN regular,        REGULAR, REGULAR

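; Each PREP_8TAP_FN line emits a small named entry point that loads the
; horizontal/vertical filter-type ids into t0d/t1d via the shared FN macro
; and then reaches the common prep_8tap body below (the last, regular
; variant simply falls through).
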
%if ARCH_X86_32
cglobal prep_8tap_16bpc, 0, 7, 8, tmp, src, ss, w, h, mx, my
%define mxb r0b
%define mxd r0
%define mxq r0
%define myb r2b
%define myd r2
%define myq r2
%else
cglobal prep_8tap_16bpc, 4, 8, 0, tmp, src, ss, w, h, mx, my
%endif
%define base t2-prep_ssse3
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    LEA                  t2, prep_ssse3
    movifnidn            wd, wm
    movifnidn          srcq, srcmp
    test                mxd, 0xf00
    jnz .h
    movifnidn            hd, hm
    test                myd, 0xf00
    jnz .v
    tzcnt                wd, wd
    mov                 myd, r7m ; bitdepth_max
    movzx                wd, word [base+prep_ssse3_table+wq*2]
    mova                 m5, [base+pw_8192]
    shr                 myd, 11
    add                  wq, t2
    movddup              m4, [base+prep_mul+myq*8]
    movifnidn           ssq, ssmp
    movifnidn          tmpq, tmpmp
    lea                  r6, [ssq*3]
%if WIN64
    pop                  r7
%endif
    jmp                  wq
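
; (The dispatch above: tzcnt turned w into log2(w), and prep_ssse3_table
; holds 16-bit offsets of the per-width no-filter loops relative to
; prep_ssse3, so adding t2 back converts a table entry into a jump target.)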
.h:
    test                myd, 0xf00
    jnz .hv
    movifnidn           ssq, r2mp
    movifnidn            hd, r4m
    movddup              m5, [base+prep_8tap_1d_rnd]
    cmp                  wd, 4
    jne .h_w8
    movzx               mxd, mxb
    movq                 m0, [base+subpel_filters+mxq*8]
    mova                 m3, [base+spel_h_shufA]
    mova                 m4, [base+spel_h_shufB]
    movifnidn          tmpq, tmpmp
    sub                srcq, 2
    WIN64_SPILL_XMM       8
    punpcklbw            m0, m0
    psraw                m0, 8
    test          dword r7m, 0x800
    jnz .h_w4_12bpc
    psllw                m0, 2
.h_w4_12bpc:
    pshufd               m6, m0, q1111
    pshufd               m7, m0, q2222
.h_w4_loop:
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pshufb               m0, m1, m3 ; 0 1 1 2 2 3 3 4
    pshufb               m1, m4     ; 2 3 3 4 4 5 5 6
    pmaddwd              m0, m6
    pmaddwd              m1, m7
    paddd                m0, m5
    paddd                m0, m1
    pshufb               m1, m2, m3
    pshufb               m2, m4
    pmaddwd              m1, m6
    pmaddwd              m2, m7
    paddd                m1, m5
    paddd                m1, m2
    psrad                m0, 4
    psrad                m1, 4
    packssdw             m0, m1
    mova             [tmpq], m0
    add                tmpq, 16
    sub                  hd, 2
    jg .h_w4_loop
    RET
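
; .h_w8 handles all widths >= 8: src/tmp are advanced to the end of the row
; and indexed with a negative offset in r6, so the inner loop just counts
; r6 up toward zero (jl) while the outer loop steps one row at a time.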
.h_w8:
    WIN64_SPILL_XMM      11
    shr                 mxd, 16
    movq                 m2, [base+subpel_filters+mxq*8]
    mova                 m4, [base+spel_h_shufA]
    mova                 m6, [base+spel_h_shufB]
    movifnidn          tmpq, r0mp
    add                  wd, wd
    punpcklbw            m2, m2
    add                srcq, wq
    psraw                m2, 8
    add                tmpq, wq
    neg                  wq
    test          dword r7m, 0x800
    jnz .h_w8_12bpc
    psllw                m2, 2
.h_w8_12bpc:
    pshufd               m7, m2, q0000
%if ARCH_X86_32
    ALLOC_STACK       -16*3
    pshufd               m0, m2, q1111
    pshufd               m1, m2, q2222
    pshufd               m2, m2, q3333
    mova                 m8, m0
    mova                 m9, m1
    mova                m10, m2
%else
    pshufd               m8, m2, q1111
    pshufd               m9, m2, q2222
    pshufd              m10, m2, q3333
%endif
.h_w8_loop0:
    mov                  r6, wq
.h_w8_loop:
    movu                 m0, [srcq+r6- 6]
    movu                 m1, [srcq+r6+ 2]
    pshufb               m2, m0, m4  ; 0 1 1 2 2 3 3 4
    pshufb               m0, m6      ; 2 3 3 4 4 5 5 6
    pmaddwd              m2, m7      ; abcd0
    pmaddwd              m0, m8      ; abcd1
    pshufb               m3, m1, m4  ; 4 5 5 6 6 7 7 8
    pshufb               m1, m6      ; 6 7 7 8 8 9 9 a
    paddd                m2, m5
    paddd                m0, m2
    pmaddwd              m2, m9, m3  ; abcd2
    pmaddwd              m3, m7      ; efgh0
    paddd                m0, m2
    pmaddwd              m2, m10, m1 ; abcd3
    pmaddwd              m1, m8      ; efgh1
    paddd                m0, m2
    movu                 m2, [srcq+r6+10]
    paddd                m3, m5
    paddd                m1, m3
    pshufb               m3, m2, m4  ; a b b c c d d e
    pshufb               m2, m6      ; 8 9 9 a a b b c
    pmaddwd              m3, m9      ; efgh2
    pmaddwd              m2, m10     ; efgh3
    paddd                m1, m3
    paddd                m1, m2
    psrad                m0, 4
    psrad                m1, 4
    packssdw             m0, m1
    mova          [tmpq+r6], m0
    add                  r6, 16
    jl .h_w8_loop
    add                srcq, ssq
    sub                tmpq, wq
    dec                  hd
    jg .h_w8_loop0
    RET
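
; Vertical prep keeps a sliding window of six row pairs (01 12 23 34 45 56)
; in registers. Each iteration loads two new rows, forms the 67/78 pairs,
; computes two output rows (a = rows 0-7, b = rows 1-8) and retires the two
; oldest pairs, so every source row is loaded only once. Scalar sketch of
; one column (fv[] = vertical taps, rnd = prep_8tap_1d_rnd):
;
;   a = rnd; b = rnd;
;   for (k = 0; k < 8; k++) {
;       a += fv[k] * row[y + k    ][x];
;       b += fv[k] * row[y + k + 1][x];
;   }
;   tmp[y + 0][x] = a >> 4;
;   tmp[y + 1][x] = b >> 4;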
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    movq                 m3, [base+subpel_filters+myq*8]
    WIN64_SPILL_XMM      15
    movddup              m7, [base+prep_8tap_1d_rnd]
    movifnidn           ssq, r2mp
    movifnidn          tmpq, r0mp
    punpcklbw            m3, m3
    psraw                m3, 8 ; sign-extend
    test          dword r7m, 0x800
    jnz .v_12bpc
    psllw                m3, 2
.v_12bpc:
%if ARCH_X86_32
    ALLOC_STACK       -16*7
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova                 m8, m0
    mova                 m9, m1
    mova                m10, m2
    mova                m11, m3
%else
    pshufd               m8, m3, q0000
    pshufd               m9, m3, q1111
    pshufd              m10, m3, q2222
    pshufd              m11, m3, q3333
%endif
    lea                  r6, [ssq*3]
    sub                srcq, r6
    mov                 r6d, wd
    shl                  wd, 6
    mov                  r5, srcq
%if ARCH_X86_64
    mov                  r7, tmpq
%elif STACK_ALIGNMENT < 16
    mov          [esp+4*29], tmpq
%endif
    lea                  wd, [wq+hq-(1<<8)]
.v_loop0:
    movq                 m1, [srcq+ssq*0]
    movq                 m2, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq                 m3, [srcq+ssq*0]
    movq                 m4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq                 m5, [srcq+ssq*0]
    movq                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    movq                 m0, [srcq+ssq*0]
    punpcklwd            m1, m2      ; 01
    punpcklwd            m2, m3      ; 12
    punpcklwd            m3, m4      ; 23
    punpcklwd            m4, m5      ; 34
    punpcklwd            m5, m6      ; 45
    punpcklwd            m6, m0      ; 56
%if ARCH_X86_32
    jmp .v_loop_start
.v_loop:
    mova                 m1, m12
    mova                 m2, m13
    mova                 m3, m14
.v_loop_start:
    pmaddwd              m1, m8      ; a0
    pmaddwd              m2, m8      ; b0
    mova                m12, m3
    mova                m13, m4
    pmaddwd              m3, m9      ; a1
    pmaddwd              m4, m9      ; b1
    paddd                m1, m3
    paddd                m2, m4
    mova                m14, m5
    mova                 m4, m6
    pmaddwd              m5, m10     ; a2
    pmaddwd              m6, m10     ; b2
    paddd                m1, m5
    paddd                m2, m6
    movq                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklwd            m5, m0, m6  ; 67
    movq                 m0, [srcq+ssq*0]
    pmaddwd              m3, m11, m5 ; a3
    punpcklwd            m6, m0      ; 78
    paddd                m1, m7
    paddd                m1, m3
    pmaddwd              m3, m11, m6 ; b3
    paddd                m2, m7
    paddd                m2, m3
    psrad                m1, 4
    psrad                m2, 4
    packssdw             m1, m2
    movq        [tmpq+r6*0], m1
    movhps      [tmpq+r6*2], m1
    lea                tmpq, [tmpq+r6*4]
    sub                  hd, 2
    jg .v_loop
%if STACK_ALIGNMENT < 16
    mov                tmpq, [esp+4*29]
    add                  r5, 8
    add                tmpq, 8
    mov                srcq, r5
    mov          [esp+4*29], tmpq
%else
    mov                tmpq, tmpmp
    add                  r5, 8
    add                tmpq, 8
    mov                srcq, r5
    mov               tmpmp, tmpq
%endif
%else
.v_loop:
    pmaddwd             m12, m8, m1  ; a0
    pmaddwd             m13, m8, m2  ; b0
    mova                 m1, m3
    mova                 m2, m4
    pmaddwd              m3, m9      ; a1
    pmaddwd              m4, m9      ; b1
    paddd               m12, m3
    paddd               m13, m4
    mova                 m3, m5
    mova                 m4, m6
    pmaddwd              m5, m10     ; a2
    pmaddwd              m6, m10     ; b2
    paddd               m12, m5
    paddd               m13, m6
    movq                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    punpcklwd            m5, m0, m6  ; 67
    movq                 m0, [srcq+ssq*0]
    pmaddwd             m14, m11, m5 ; a3
    punpcklwd            m6, m0      ; 78
    paddd               m12, m7
    paddd               m12, m14
    pmaddwd             m14, m11, m6 ; b3
    paddd               m13, m7
    paddd               m13, m14
    psrad               m12, 4
    psrad               m13, 4
    packssdw            m12, m13
    movq        [tmpq+r6*0], m12
    movhps      [tmpq+r6*2], m12
    lea                tmpq, [tmpq+r6*4]
    sub                  hd, 2
    jg .v_loop
    add                  r5, 8
    add                  r7, 8
    mov                srcq, r5
    mov                tmpq, r7
%endif
    movzx                hd, wb
    sub                  wd, 1<<8
    jg .v_loop0
    RET
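
; 2D prep: same separable structure as put's .hv above, but the result is
; written to the tmp buffer as 16-bit intermediates (rounded with
; prep_8tap_2d_rnd and shifted by 6) instead of being clamped to pixel range.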
.hv:
    RESET_STACK_STATE
    movzx               t3d, mxb
    shr                 mxd, 16
    cmp                  wd, 4
    cmove               mxd, t3d
    movifnidn            hd, r4m
    movq                 m2, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd
    movq                 m3, [base+subpel_filters+myq*8]
%if ARCH_X86_32
    mov                 ssq, r2mp
    mov                tmpq, r0mp
    mova                 m0, [base+spel_h_shufA]
    mova                 m1, [base+spel_h_shufB]
    mova                 m4, [base+prep_8tap_2d_rnd]
    ALLOC_STACK      -16*14
    mova                 m8, m0
    mova                 m9, m1
    mova                m14, m4
%else
%if WIN64
    ALLOC_STACK        16*6, 16
%endif
    mova                 m8, [base+spel_h_shufA]
    mova                 m9, [base+spel_h_shufB]
%endif
    pxor                 m0, m0
    punpcklbw            m0, m2
    punpcklbw            m3, m3
    psraw                m0, 4
    psraw                m3, 8
    test          dword r7m, 0x800
    jz .hv_10bpc
    psraw                m0, 2
.hv_10bpc:
    lea                  r6, [ssq*3]
    sub                srcq, 6
    sub                srcq, r6
    mov                 r6d, wd
    shl                  wd, 6
    mov                  r5, srcq
%if ARCH_X86_32
    %define             tmp  esp+16*8
%if STACK_ALIGNMENT < 16
    mov          [esp+4*61], tmpq
%endif
    pshufd               m1, m0, q0000
    pshufd               m2, m0, q1111
    pshufd               m5, m0, q2222
    pshufd               m0, m0, q3333
    mova                m10, m1
    mova                m11, m2
    mova                m12, m5
    mova                m13, m0
%else
%if WIN64
    %define             tmp  rsp
%else
    %define             tmp  rsp-88 ; red zone
%endif
    mov                  r7, tmpq
    pshufd              m10, m0, q0000
    pshufd              m11, m0, q1111
    pshufd              m12, m0, q2222
    pshufd              m13, m0, q3333
%endif
    lea                  wd, [wq+hq-(1<<8)]
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
    mova         [tmp+16*1], m0
    mova         [tmp+16*2], m1
    mova         [tmp+16*3], m2
    mova         [tmp+16*4], m3
.hv_loop0:
%if ARCH_X86_64
    mova                m14, [prep_8tap_2d_rnd]
%endif
    movu                 m4, [srcq+ssq*0+0]
    movu                 m1, [srcq+ssq*0+8]
    movu                 m5, [srcq+ssq*1+0]
    movu                 m2, [srcq+ssq*1+8]
    lea                srcq, [srcq+ssq*2]
    movu                 m6, [srcq+ssq*0+0]
    movu                 m3, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         4, 1, 0, 6
    PUT_8TAP_HV_H         5, 2, 0, 6
    PUT_8TAP_HV_H         6, 3, 0, 6
    movu                 m7, [srcq+ssq*1+0]
    movu                 m2, [srcq+ssq*1+8]
    lea                srcq, [srcq+ssq*2]
    movu                 m1, [srcq+ssq*0+0]
    movu                 m3, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         7, 2, 0, 6
    PUT_8TAP_HV_H         1, 3, 0, 6
    movu                 m2, [srcq+ssq*1+0]
    movu                 m3, [srcq+ssq*1+8]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H         2, 3, 0, 6
    packssdw             m4, m7      ; 0 3
    packssdw             m5, m1      ; 1 4
    movu                 m0, [srcq+ssq*0+0]
    movu                 m1, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         0, 1, 3, 6
    packssdw             m6, m2      ; 2 5
    packssdw             m7, m0      ; 3 6
    punpcklwd            m1, m4, m5  ; 01
    punpckhwd            m4, m5      ; 34
    punpcklwd            m2, m5, m6  ; 12
    punpckhwd            m5, m6      ; 45
    punpcklwd            m3, m6, m7  ; 23
    punpckhwd            m6, m7      ; 56
%if ARCH_X86_32
    jmp .hv_loop_start
.hv_loop:
    mova                 m1, [tmp+16*5]
    mova                 m2, m15
.hv_loop_start:
    mova                 m7, [tmp+16*1]
    pmaddwd              m1, m7      ; a0
    pmaddwd              m2, m7      ; b0
    mova                 m7, [tmp+16*2]
    mova         [tmp+16*5], m3
    pmaddwd              m3, m7      ; a1
    mova                m15, m4
    pmaddwd              m4, m7      ; b1
    mova                 m7, [tmp+16*3]
    paddd                m1, m14
    paddd                m2, m14
    paddd                m1, m3
    paddd                m2, m4
    mova                 m3, m5
    pmaddwd              m5, m7      ; a2
    mova                 m4, m6
    pmaddwd              m6, m7      ; b2
    paddd                m1, m5
    paddd                m2, m6
    movu                 m7, [srcq+ssq*1+0]
    movu                 m5, [srcq+ssq*1+8]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H         7, 5, 6, 6
    packssdw             m0, m7      ; 6 7
    mova         [tmp+16*0], m0
    movu                 m0, [srcq+ssq*0+0]
    movu                 m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         0, 5, 6, 6
    mova                 m6, [tmp+16*0]
    packssdw             m7, m0      ; 7 8
    punpcklwd            m5, m6, m7  ; 67
    punpckhwd            m6, m7      ; 78
    pmaddwd              m7, m5, [tmp+16*4]
    paddd                m1, m7      ; a3
    pmaddwd              m7, m6, [tmp+16*4]
    paddd                m2, m7      ; b3
    psrad                m1, 6
    psrad                m2, 6
    packssdw             m1, m2
    movq        [tmpq+r6*0], m1
    movhps      [tmpq+r6*2], m1
    lea                tmpq, [tmpq+r6*4]
    sub                  hd, 2
    jg .hv_loop
%if STACK_ALIGNMENT < 16
    mov                tmpq, [esp+4*61]
    add                  r5, 8
    add                tmpq, 8
    mov                srcq, r5
    mov          [esp+4*61], tmpq
%else
    mov                tmpq, tmpmp
    add                  r5, 8
    add                tmpq, 8
    mov                srcq, r5
    mov               tmpmp, tmpq
%endif
%else
.hv_loop:
    mova                m15, [tmp+16*1]
    mova                 m7, [prep_8tap_2d_rnd]
    pmaddwd             m14, m15, m1 ; a0
    pmaddwd             m15, m2      ; b0
    paddd               m14, m7
    paddd               m15, m7
    mova                 m7, [tmp+16*2]
    mova                 m1, m3
    pmaddwd              m3, m7      ; a1
    mova                 m2, m4
    pmaddwd              m4, m7      ; b1
    mova                 m7, [tmp+16*3]
    paddd               m14, m3
    paddd               m15, m4
    mova                 m3, m5
    pmaddwd              m5, m7      ; a2
    mova                 m4, m6
    pmaddwd              m6, m7      ; b2
    paddd               m14, m5
    paddd               m15, m6
    movu                 m7, [srcq+ssq*1+0]
    movu                 m5, [srcq+ssq*1+8]
    lea                srcq, [srcq+ssq*2]
    PUT_8TAP_HV_H         7, 5, 6, 6, [prep_8tap_2d_rnd]
    packssdw             m0, m7      ; 6 7
    mova         [tmp+16*0], m0
    movu                 m0, [srcq+ssq*0+0]
    movu                 m5, [srcq+ssq*0+8]
    PUT_8TAP_HV_H         0, 5, 6, 6, [prep_8tap_2d_rnd]
    mova                 m6, [tmp+16*0]
    packssdw             m7, m0      ; 7 8
    punpcklwd            m5, m6, m7  ; 67
    punpckhwd            m6, m7      ; 78
    pmaddwd              m7, m5, [tmp+16*4]
    paddd               m14, m7      ; a3
    pmaddwd              m7, m6, [tmp+16*4]
    paddd               m15, m7      ; b3
    psrad               m14, 6
    psrad               m15, 6
    packssdw            m14, m15
    movq        [tmpq+r6*0], m14
    movhps      [tmpq+r6*2], m14
    lea                tmpq, [tmpq+r6*4]
    sub                  hd, 2
    jg .hv_loop
    add                  r5, 8
    add                  r7, 8
    mov                srcq, r5
    mov                tmpq, r7
%endif
    movzx                hd, wb
    sub                  wd, 1<<8
    jg .hv_loop0
    RET
%undef tmp

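; Scaled motion compensation. put_8tap_scaled and prep_8tap_scaled share one
; body through the isput/isprep flags set by MC_8TAP_SCALED below. The
; source position advances by dx/dy per output pixel/row in 1/1024-pel
; units: the low 10 bits of my (masked with 0x3ff in the loops) select the
; subpel filter phase, and the overflow decides how many source rows to
; step. A dy of exactly 1024 or 2048 (1 or 2 source rows per output row,
; constant phase) branches to the specialized .dy1/.dy2 paths.
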
%macro movifprep 2
 %if isprep
    mov %1, %2
 %endif
%endmacro

%macro SAVE_REG 1
 %xdefine r%1_save  r%1
 %xdefine r%1q_save r%1q
 %xdefine r%1d_save r%1d
 %if ARCH_X86_32
  %define r%1m_save [rstk+stack_offset+(%1+1)*4]
 %endif
%endmacro

%macro LOAD_REG 1
 %xdefine r%1  r%1_save
 %xdefine r%1q r%1q_save
 %xdefine r%1d r%1d_save
 %if ARCH_X86_32
  %define r%1m r%1m_save
 %endif
 %undef r%1d_save
 %undef r%1q_save
 %undef r%1_save
%endmacro

%macro REMAP_REG 2-3
 %xdefine r%1  r%2
 %xdefine r%1q r%2q
 %xdefine r%1d r%2d
 %if ARCH_X86_32
  %if %3 == 0
   %xdefine r%1m r%2m
  %else
   %define r%1m [rstk+stack_offset+(%1+1)*4]
  %endif
 %endif
%endmacro
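
; SAVE_REG/LOAD_REG/REMAP_REG rename registers at assemble time: for prep
; (which lacks put's dst/ds arguments) every rN is remapped to rN-1, so the
; shared body below addresses the right registers for both entry points.
; The ..._TO_PREV/_TO_DEFAULT pairs switch the mapping around code that
; needs the true register names.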

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_PREV 0
 %if isprep
  %if ARCH_X86_64
   SAVE_REG 14
   %assign %%i 14
   %rep 14
    %assign %%j %%i-1
    REMAP_REG %%i, %%j
    %assign %%i %%i-1
   %endrep
  %else
   SAVE_REG 5
   %assign %%i 5
   %rep 5
    %assign %%j %%i-1
    REMAP_REG %%i, %%j, 0
    %assign %%i %%i-1
   %endrep
  %endif
 %endif
%endmacro

%macro MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT 0
 %if isprep
  %assign %%i 1
  %if ARCH_X86_64
   %rep 13
    %assign %%j %%i+1
    REMAP_REG %%i, %%j
    %assign %%i %%i+1
   %endrep
   LOAD_REG 14
  %else
   %rep 4
    %assign %%j %%i+1
    REMAP_REG %%i, %%j, 1
    %assign %%i %%i+1
   %endrep
   LOAD_REG 5
  %endif
 %endif
%endmacro

%macro MC_8TAP_SCALED_RET 0-1 1 ; leave_mapping_unchanged
    MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    RET
 %if %1
    MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %endif
%endmacro
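
; Note that the remap after RET is purely an assemble-time action: it
; restores the shifted mapping for whatever code the macro's caller emits
; next; nothing executes after the RET at run time.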

%if ARCH_X86_32
 %macro MC_4TAP_SCALED_H 1 ; dst_mem
    movu                 m7, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m5, [r4  +ssq*0]
    movu                 m6, [r4  +ssq*1]
    lea                srcq, [srcq+ssq*2]
    lea                  r4, [r4  +ssq*2]
    REPX    {pshufb x, m12}, m7, m2
    REPX   {pmaddwd x, m13}, m7, m2
    REPX    {pshufb x, m14}, m5, m6
    REPX   {pmaddwd x, m15}, m5, m6
    phaddd               m7, m5
    phaddd               m2, m6
    mova                 m5, [esp+0x00]
    movd                 m6, [esp+0x10]
    paddd                m7, m5
    paddd                m2, m5
    psrad                m7, m6
    psrad                m2, m6
    packssdw             m7, m2
    mova           [stk+%1], m7
 %endmacro
%endif

%if ARCH_X86_64
 %macro MC_8TAP_SCALED_H 8 ; dst, tmp[0-6]
    movu                m%1, [srcq+ r4*2]
    movu                m%2, [srcq+ r6*2]
    movu                m%3, [srcq+ r7*2]
    movu                m%4, [srcq+ r9*2]
    movu                m%5, [srcq+r10*2]
    movu                m%6, [srcq+r11*2]
    movu                m%7, [srcq+r13*2]
    movu                m%8, [srcq+ rX*2]
    add                srcq, ssq
    pmaddwd             m%1, [stk+0x10]
    pmaddwd             m%2, [stk+0x20]
    pmaddwd             m%3, [stk+0x30]
    pmaddwd             m%4, [stk+0x40]
    pmaddwd             m%5, [stk+0x50]
    pmaddwd             m%6, [stk+0x60]
    pmaddwd             m%7, [stk+0x70]
    pmaddwd             m%8, [stk+0x80]
    phaddd              m%1, m%2
    phaddd              m%3, m%4
    phaddd              m%5, m%6
    phaddd              m%7, m%8
    phaddd              m%1, m%3
    phaddd              m%5, m%7
    paddd               m%1, hround
    paddd               m%5, hround
    psrad               m%1, m12
    psrad               m%5, m12
    packssdw            m%1, m%5
 %endmacro
%else
 %macro MC_8TAP_SCALED_H 2-3 1 ; weights_mem_start, h_mem, load_fh_offsets
  %if %3 == 1
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
  %endif
    movu                 m0, [srcq+r0*2]
    movu                 m1, [srcq+rX*2]
    movu                 m2, [srcq+r4*2]
    movu                 m3, [srcq+r5*2]
    mov                  r0, [stk+16]
    mov                  rX, [stk+20]
    mov                  r4, [stk+24]
    mov                  r5, [stk+28]
    pmaddwd              m0, [stk+%1+0x00]
    pmaddwd              m1, [stk+%1+0x10]
    pmaddwd              m2, [stk+%1+0x20]
    pmaddwd              m3, [stk+%1+0x30]
    phaddd               m0, m1
    phaddd               m2, m3
    movu                 m4, [srcq+r0*2]
    movu                 m5, [srcq+rX*2]
    movu                 m6, [srcq+r4*2]
    movu                 m7, [srcq+r5*2]
    add                srcq, ssq
    pmaddwd              m4, [stk+%1+0xa0]
    pmaddwd              m5, [stk+%1+0xb0]
    pmaddwd              m6, [stk+%1+0xc0]
    pmaddwd              m7, [stk+%1+0xd0]
    phaddd               m4, m5
    phaddd               m6, m7
    phaddd               m0, m2
    phaddd               m4, m6
    paddd                m0, hround
    paddd                m4, hround
    psrad                m0, m12
    psrad                m4, m12
    packssdw             m0, m4
  %if %2 != 0
    mova           [stk+%2], m0
  %endif
 %endmacro
%endif
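
; MC_8TAP_SCALED_H is the scaled horizontal filter: each of the eight output
; pixels has its own source offset (precomputed from mx+dx*[0-7]) and its
; own 8-tap phase stored on the stack, unlike the unscaled path where one
; filter serves the whole row. Scalar C sketch (illustrative names):
;
;   for (i = 0; i < 8; i++) {            // one output pixel per offset
;       const pixel *p = &src[ofs[i]];
;       int sum = hrnd;                  // rounding bias (hround)
;       for (k = 0; k < 8; k++)
;           sum += fh[i][k] * p[k];
;       out[i] = sum >> hsh;             // per-bitdepth shift (m12)
;   }
;
; The phaddd tree performs the eight 8-tap reductions in parallel.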

%macro MC_8TAP_SCALED 1
%ifidn %1, put
 %assign isput  1
 %assign isprep 0
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %else
cglobal put_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %endif
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal put_8tap_scaled_16bpc, 0, 7, 8, 0x200, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %else
cglobal put_8tap_scaled_16bpc, 0, 7, 8, -0x200-0x30, dst, ds, src, ss, w, h, mx, my, dx, dy, pxmax
  %endif
 %endif
 %xdefine base_reg r12
%else ; prep
 %assign isput  0
 %assign isprep 1
 %if ARCH_X86_64
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_16bpc, 2, 15, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   %xdefine tmp_stridem r14q
  %else
cglobal prep_8tap_scaled_16bpc, 2, 14, 16, 0x1c0, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
   %define tmp_stridem qword [stk+0x138]
  %endif
  %xdefine base_reg r11
 %else ; ARCH_X86_32
  %if required_stack_alignment <= STACK_ALIGNMENT
cglobal prep_8tap_scaled_16bpc, 0, 7, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
  %else
cglobal prep_8tap_scaled_16bpc, 0, 6, 8, 0x200, tmp, src, ss, w, h, mx, my, dx, dy, pxmax
  %endif
  %define tmp_stridem dword [stk+0x138]
 %endif
%endif
%if ARCH_X86_32
    mov         [esp+0x1f0], t0d
    mov         [esp+0x1f4], t1d
 %if isput && required_stack_alignment > STACK_ALIGNMENT
    mov                dstd, dstm
    mov                 dsd, dsm
    mov                srcd, srcm
    mov                 ssd, ssm
    mov                  hd, hm
    mov                  r4, mxm
  %define r0m  [esp+0x200]
  %define dsm  [esp+0x204]
  %define dsmp dsm
  %define r1m  dsm
  %define r2m  [esp+0x208]
  %define ssm  [esp+0x20c]
  %define r3m  ssm
  %define hm   [esp+0x210]
  %define mxm  [esp+0x214]
    mov                 r0m, dstd
    mov                 dsm, dsd
    mov                 r2m, srcd
    mov                 ssm, ssd
    mov                  hm, hd
    mov                  r0, mym
    mov                  r1, dxm
    mov                  r2, dym
  %define mym    [esp+0x218]
  %define dxm    [esp+0x21c]
  %define dym    [esp+0x220]
    mov                 mxm, r4
    mov                 mym, r0
    mov                 dxm, r1
    mov                 dym, r2
    tzcnt                wd, wm
 %endif
 %if isput
    mov                  r3, pxmaxm
  %define pxmaxm r3
 %else
    mov                  r2, pxmaxm
 %endif
 %if isprep && required_stack_alignment > STACK_ALIGNMENT
  %xdefine base_reg r5
 %else
  %xdefine base_reg r6
 %endif
%endif
    LEA            base_reg, %1_8tap_scaled_16bpc_ssse3
%xdefine base base_reg-%1_8tap_scaled_16bpc_ssse3
%if ARCH_X86_64 || isprep || required_stack_alignment <= STACK_ALIGNMENT
    tzcnt                wd, wm
%endif
%if ARCH_X86_64
 %if isput
    mov                 r7d, pxmaxm
 %endif
%else
 %define m8  m0
 %define m9  m1
 %define m14 m4
 %define m15 m3
%endif
    movd                 m8, dxm
    movd                m14, mxm
%if isput
    movd                m15, pxmaxm
%endif
    pshufd               m8, m8, q0000
    pshufd              m14, m14, q0000
%if isput
    pshuflw             m15, m15, q0000
    punpcklqdq          m15, m15
%endif
%if isprep
 %if UNIX64
    mov                 r5d, t0d
  DECLARE_REG_TMP 5, 7
 %endif
 %if ARCH_X86_64
    mov                 r6d, pxmaxm
 %endif
%endif
%if ARCH_X86_64
    mov                 dyd, dym
%endif
%if isput
 %if WIN64
    mov                 r8d, hm
  DEFINE_ARGS dst, ds, src, ss, w, _, _, my, h, dy, ss3
  %define hm r5m
  %define dxm r8m
 %elif ARCH_X86_64
  DEFINE_ARGS dst, ds, src, ss, w, h, _, my, dx, dy, ss3
  %define hm r6m
 %else
 %endif
 %if ARCH_X86_64
  %if required_stack_alignment > STACK_ALIGNMENT
   %define dsm [rsp+0x138]
   %define rX r1
   %define rXd r1d
  %else
   %define dsm dsq
   %define rX r14
   %define rXd r14d
  %endif
 %else
  %define rX r1
 %endif
%else ; prep
 %if WIN64
    mov                 r7d, hm
  DEFINE_ARGS tmp, src, ss, w, _, _, my, h, dy, ss3
  %define hm r4m
  %define dxm r7m
 %elif ARCH_X86_64
  DEFINE_ARGS tmp, src, ss, w, h, _, my, dx, dy, ss3
  %xdefine hm r7m
 %endif
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %if ARCH_X86_64
  %define rX r14
  %define rXd r14d
 %else
  %define rX r3
 %endif
%endif
%if ARCH_X86_64
    shr                 r7d, 11
    mova                m10, [base+pd_0x3ff]
    movddup             m11, [base+s_8tap_h_rnd+r7*8]
    movd                m12, [base+s_8tap_h_sh+r7*4]
 %if isput
    movddup             m13, [base+put_s_8tap_v_rnd+r7*8]
    movd                 m7, [base+put_s_8tap_v_sh+r7*4]
  %define pxmaxm [rsp]
    mova             pxmaxm, m15
    punpcklqdq          m12, m7
 %endif
    lea                ss3q, [ssq*3]
    movzx               r7d, t1b
    shr                 t1d, 16
    cmp                  hd, 6
    cmovs               t1d, r7d
    sub                srcq, ss3q
%else
 %define m10    [base+pd_0x3ff]
 %define m11    [esp+0x00]
 %define m12    [esp+0x10]
    shr                  r3, 11
    movddup              m1, [base+s_8tap_h_rnd+r3*8]
    movd                 m2, [base+s_8tap_h_sh+r3*4]
 %if isput
  %define m13    [esp+0x20]
  %define pxmaxm [esp+0x30]
  %define stk esp+0x40
    movddup              m5, [base+put_s_8tap_v_rnd+r3*8]
    movd                 m6, [base+put_s_8tap_v_sh+r3*4]
    mova             pxmaxm, m15
    punpcklqdq           m2, m6
    mova                m13, m5
 %else
  %define m13 [base+pd_m524256]
 %endif
    mov                 ssd, ssm
    mova                m11, m1
    mova                m12, m2
 MCT_8TAP_SCALED_REMAP_REGS_TO_DEFAULT
    mov                  r1, [esp+0x1f4]
    lea                  r0, [ssd*3]
    movzx                r2, r1b
    shr                  r1, 16
    cmp            dword hm, 6
    cmovs                r1, r2
    mov         [esp+0x1f4], r1
 %if isprep
    mov                  r1, r1m
 %endif
    mov                  r2, r2m
    sub                srcq, r0
 MCT_8TAP_SCALED_REMAP_REGS_TO_PREV
 %define ss3q r0
 %define myd r4
 %define dyd dword dym
 %define hd  dword hm
%endif
    cmp                 dyd, 1024
    je .dy1
    cmp                 dyd, 2048
    je .dy2
    movzx                wd, word [base+%1_8tap_scaled_ssse3_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%if isput
.w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
 %else
    movzx                r4, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r4
 %endif
    pxor                 m9, m9
    punpckldq            m9, m8
    paddd               m14, m9 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    pshufd              m15, m15, q0321
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_q]
    mova                 m6, [base+spel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m2, m2
    pcmpeqd              m8, m2
    psrld               m14, 10
    paddd               m14, m14
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova              [stk], m14
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m15 m6
 %endif
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpckldq           m15, m7
 %if ARCH_X86_64
    pshufb              m14, m5
    paddb               m14, m6
    pand                 m9, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m9
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    movu                 m7, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
 %else
    pand                 m7, m5, [base+pd_0x4000]
    pandn                m5, m15
    por                  m5, m7
  %define m15 m5
 %endif
    punpcklbw           m15, m15
    psraw               m15, 8
    REPX    {pshufb x, m14}, m0, m1, m2, m3
    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
 %if ARCH_X86_64
    REPX    {pshufb x, m14}, m4, m5, m6, m7
    REPX   {pmaddwd x, m15}, m4, m5, m6, m7
    phaddd               m0, m1
    phaddd               m2, m3
    phaddd               m4, m5
    phaddd               m6, m7
    REPX     {paddd x, m11}, m0, m2, m4, m6
    REPX     {psrad x, m12}, m0, m2, m4, m6
    packssdw             m0, m2 ; 0 1 2 3
    packssdw             m4, m6 ; 4 5 6 7
    SWAP                 m1, m4
 %else
    mova         [stk+0x10], m15
    phaddd               m0, m1
    phaddd               m2, m3
    movu                 m1, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    REPX    {pshufb x, m14}, m1, m7, m6, m3
    REPX   {pmaddwd x, m15}, m1, m7, m6, m3
    phaddd               m1, m7
    phaddd               m6, m3
    REPX     {paddd x, m11}, m0, m2, m1, m6
    REPX     {psrad x, m12}, m0, m2, m1, m6
    packssdw             m0, m2
    packssdw             m1, m6
  %define m14 [stk+0x00]
  %define m15 [stk+0x10]
 %endif
    palignr              m2, m1, m0, 4 ; 1 2 3 4
    punpcklwd            m3, m0, m2    ; 01 12
    punpckhwd            m0, m2        ; 23 34
    pshufd               m5, m1, q0321 ; 5 6 7 _
    punpcklwd            m2, m1, m5    ; 45 56
    punpckhwd            m4, m1, m5    ; 67 __
 %if ARCH_X86_32
    mov                 myd, mym
    mov                  r0, r0m
    mova         [stk+0x20], m3
    mova         [stk+0x30], m0
    mova         [stk+0x40], m2
    mova         [stk+0x50], m4
 %endif
.w2_loop:
    and                 myd, 0x3ff
 %if ARCH_X86_64
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq                m10, r6q
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pmaddwd              m5, m3, m7
    pmaddwd              m6, m0, m8
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
    pmaddwd              m7, m2, m9
    pmaddwd              m8, m4, m10
    paddd                m5, m6
    paddd                m7, m8
 %else
    mov                  r1, [esp+0x1f4]
    xor                  r3, r3
    mov                  r5, myd
    shr                  r5, 6
    lea                  r1, [r1+r5]
    mov                  r5, 64 << 24
    cmovnz               r3, [base+subpel_filters+r1*8+4]
    cmovnz               r5, [base+subpel_filters+r1*8+0]
    movd                 m6, r3
    movd                 m7, r5
    punpckldq            m7, m6
    punpcklbw            m7, m7
    psraw                m7, 8
    pshufd               m5, m7, q0000
    pshufd               m6, m7, q1111
    pmaddwd              m3, m5
    pmaddwd              m0, m6
    pshufd               m5, m7, q2222
    pshufd               m7, m7, q3333
    pmaddwd              m2, m5
    pmaddwd              m4, m7
    paddd                m3, m0
    paddd                m2, m4
    SWAP                 m5, m3
    SWAP                 m7, m2
  %define m8 m3
 %endif
    paddd                m5, m13
    pshufd               m6, m12, q1032
    pxor                 m8, m8
    paddd                m5, m7
    psrad                m5, m6
    packssdw             m5, m5
    pmaxsw               m5, m8
    pminsw               m5, pxmaxm
    movd             [dstq], m5
    add                dstq, dsmp
    dec                  hd
    jz .ret
 %if ARCH_X86_64
    add                 myd, dyd
 %else
    add                 myd, dym
 %endif
    test                myd, ~0x3ff
 %if ARCH_X86_32
    SWAP                 m3, m5
    SWAP                 m2, m7
    mova                 m3, [stk+0x20]
    mova                 m0, [stk+0x30]
    mova                 m2, [stk+0x40]
    mova                 m4, [stk+0x50]
 %endif
    jz .w2_loop
 %if ARCH_X86_32
    mov                  r3, r3m
 %endif
    movu                 m5, [srcq]
    test                myd, 0x400
    jz .w2_skip_line
    add                srcq, ssq
    shufps               m3, m0, q1032      ; 01 12
    shufps               m0, m2, q1032      ; 23 34
    shufps               m2, m4, q1032      ; 45 56
    pshufb               m5, m14
    pmaddwd              m5, m15
    phaddd               m5, m5
    paddd                m5, m11
    psrad                m5, m12
    packssdw             m5, m5
    palignr              m4, m5, m1, 12
    punpcklqdq           m1, m4, m4         ; 6 7 6 7
    punpcklwd            m4, m1, m5         ; 67 __
 %if ARCH_X86_32
    mova         [stk+0x20], m3
    mova         [stk+0x30], m0
    mova         [stk+0x40], m2
    mova         [stk+0x50], m4
 %endif
    jmp .w2_loop
.w2_skip_line:
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova                 m3, m0             ; 01 12
    mova                 m0, m2             ; 23 34
    pshufb               m5, m14
    pshufb               m6, m14
    pmaddwd              m5, m15
    pmaddwd              m6, m15
    phaddd               m5, m6
    paddd                m5, m11
    psrad                m5, m12
    packssdw             m5, m5             ; 6 7 6 7
    punpckhqdq           m1, m5             ; 4 5 6 7
    pshufd               m5, m1, q0321      ; 5 6 7 _
    punpcklwd            m2, m1, m5         ; 45 56
    punpckhwd            m4, m1, m5         ; 67 __
 %if ARCH_X86_32
    mova         [stk+0x20], m3
    mova         [stk+0x30], m0
    mova         [stk+0x40], m2
    mova         [stk+0x50], m4
 %endif
    jmp .w2_loop
%endif
3233INIT_XMM ssse3
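; .w4 horizontal setup: per-column 4-tap filters (the middle taps of the 8-tap
; subpel filters, hence the +2 offset), selected from mx+dx*[0-3]; columns with
; a fractional position of 0 get a single-tap filter (pd_0x4000) instead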
.w4:
%if ARCH_X86_64
    mov                 myd, mym
    mova         [rsp+0x10], m11
    mova         [rsp+0x20], m12
 %if isput
    mova         [rsp+0x30], m13
 %endif
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
%else
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
    movzx                r4, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r4
%endif
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
%else
 %define m9 [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m0, m14, m10
    psrld                m0, 6
    paddd               m15, m0
    pshufd               m7, m15, q1032
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                r6d, m15
    movd               r13d, m7
    mova                m10, [base+bdct_lb_q+ 0]
    mova                m11, [base+bdct_lb_q+16]
    movd                m13, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+ r6*8+2]
    movd                m15, [base+subpel_filters+r11*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
%else
    movd                 r0, m15
    movd                 r4, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                 rX, m15
    movd                 r5, m7
    mova                 m5, [base+bdct_lb_q+ 0]
    mova                 m6, [base+bdct_lb_q+16]
    movd                 m1, [base+subpel_filters+r0*8+2]
    movd                 m2, [base+subpel_filters+rX*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m7, [base+subpel_filters+r5*8+2]
    movifprep            r3, r3m
    SWAP                 m4, m7
 %define m10 m5
 %define m11 m6
 %define m12 m1
 %define m13 m1
%endif
    psrld               m14, 10
    paddd               m14, m14
    punpckldq           m13, m2
    punpckldq           m15, m4
    punpcklqdq          m13, m15
    pxor                 m2, m2
    pcmpeqd              m0, m2
%if ARCH_X86_64
    pand                 m9, m0
%else
    pand                 m2, m9, m0
 %define m9 m2
    SWAP                 m7, m4
%endif
    pandn                m0, m13
%if ARCH_X86_64
    SWAP                m13, m0
%else
 %define m13 m0
%endif
    por                 m13, m9
    punpckhbw           m15, m13, m13
    punpcklbw           m13, m13
    psraw               m15, 8
    psraw               m13, 8
    pshufb              m12, m14, m10
    pshufb              m14, m11
    mova                m10, [base+spel_s_shuf2]
    movd                r4d, m14
    shr                 r4d, 24
%if ARCH_X86_32
    mova         [stk+0x20], m13
    mova         [stk+0x30], m15
    pxor                 m2, m2
%endif
    pshufb               m7, m14, m2
    psubb               m14, m7
    paddb               m12, m10
    paddb               m14, m10
%if ARCH_X86_64
    lea                  r6, [r4+ssq*1]
    lea                 r11, [r4+ssq*2]
    lea                 r13, [r4+ss3q ]
    movu                 m7, [srcq+ssq*0]
    movu                 m9, [srcq+ssq*1]
    movu                 m8, [srcq+ssq*2]
    movu                m10, [srcq+ss3q ]
    movu                 m1, [srcq+r4   ]
    movu                 m3, [srcq+r6   ]
    movu                 m2, [srcq+r11  ]
    movu                 m4, [srcq+r13  ]
    lea                srcq, [srcq+ssq*4]
    REPX    {pshufb x, m12}, m7, m9, m8, m10
    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
    REPX    {pshufb x, m14}, m1, m2, m3, m4
    REPX   {pmaddwd x, m15}, m1, m2, m3, m4
    mova                 m5, [rsp+0x10]
    movd                xm6, [rsp+0x20]
    phaddd               m7, m1
    phaddd               m9, m3
    phaddd               m8, m2
    phaddd              m10, m4
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    movu                 m4, [srcq+ss3q ]
    REPX      {paddd x, m5}, m7, m9, m8, m10
    REPX     {psrad x, xm6}, m7, m9, m8, m10
    packssdw             m7, m9  ; 0 1
    packssdw             m8, m10 ; 2 3
    movu                 m0, [srcq+r4   ]
    movu                 m9, [srcq+r6   ]
    movu                m10, [srcq+r11  ]
    movu                m11, [srcq+r13  ]
    lea                srcq, [srcq+ssq*4]
    REPX    {pshufb x, m12}, m1, m2, m3, m4
    REPX   {pmaddwd x, m13}, m1, m2, m3, m4
    REPX    {pshufb x, m14}, m0, m9, m10, m11
    REPX   {pmaddwd x, m15}, m0, m9, m10, m11
    phaddd               m1, m0
    phaddd               m2, m9
    phaddd               m3, m10
    phaddd               m4, m11
    REPX      {paddd x, m5}, m1, m2, m3, m4
    REPX     {psrad x, xm6}, m1, m2, m3, m4
    packssdw             m1, m2 ; 4 5
    packssdw             m3, m4 ; 6 7
    SWAP                 m9, m1
    shufps               m4, m7, m8, q1032  ; 1 2
    shufps               m5, m8, m9, q1032  ; 3 4
    shufps               m6, m9, m3, q1032  ; 5 6
    pshufd              m10, m3, q1032      ; 7 _
    punpcklwd            m0, m7, m4 ; 01
    punpckhwd            m7, m4     ; 12
    punpcklwd            m1, m8, m5 ; 23
    punpckhwd            m8, m5     ; 34
    punpcklwd            m2, m9, m6 ; 45
    punpckhwd            m9, m6     ; 56
    punpcklwd            m3, m10    ; 67
    mova         [rsp+0x40], m7
    mova         [rsp+0x50], m8
    mova         [rsp+0x60], m9
%else
    mova         [stk+0x00], m12
    mova         [stk+0x10], m14
    add                  r4, srcq
    MC_4TAP_SCALED_H   0x40 ; 0 1
    MC_4TAP_SCALED_H   0x50 ; 2 3
    MC_4TAP_SCALED_H   0x60 ; 4 5
    MC_4TAP_SCALED_H   0x70 ; 6 7
    mova                 m4, [stk+0x40]
    mova                 m5, [stk+0x50]
    mova                 m6, [stk+0x60]
    mova                 m7, [stk+0x70]
    mov          [stk+0xc0], r4
    shufps               m1, m4, m5, q1032 ; 1 2
    shufps               m2, m5, m6, q1032 ; 3 4
    shufps               m3, m6, m7, q1032 ; 5 6
    pshufd               m0, m7, q1032     ; 7 _
    mova         [stk+0xb0], m0
    punpcklwd            m0, m4, m1         ; 01
    punpckhwd            m4, m1             ; 12
    punpcklwd            m1, m5, m2         ; 23
    punpckhwd            m5, m2             ; 34
    punpcklwd            m2, m6, m3         ; 45
    punpckhwd            m6, m3             ; 56
    punpcklwd            m3, m7, [stk+0xb0] ; 67
    mov                 myd, mym
    mov                  r0, r0m
    mova         [stk+0x40], m0 ; 01
    mova         [stk+0x50], m1 ; 23
    mova         [stk+0x60], m2 ; 45
    mova         [stk+0x70], m3 ; 67
    mova         [stk+0x80], m4 ; 12
    mova         [stk+0x90], m5 ; 34
    mova         [stk+0xa0], m6 ; 56
 %define m12 [stk+0x00]
 %define m14 [stk+0x10]
 %define m13 [stk+0x20]
 %define m15 [stk+0x30]
 %define hrnd_mem [esp+0x00]
 %define hsh_mem  [esp+0x10]
 %if isput
  %define vrnd_mem [esp+0x20]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
%endif
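; one output row per iteration; the 8-tap vertical filter is reloaded from the
; my accumulator every row, the cmovnz leaving a single-tap filter (64 << 24)
; in place when the row's subpel index is 0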
.w4_loop:
    and                 myd, 0x3ff
%if ARCH_X86_64
    mov                r11d, 64 << 24
    mov                r13d, myd
    shr                r13d, 6
    lea                r13d, [t1+r13]
    cmovnz             r11q, [base+subpel_filters+r13*8]
    movq                 m9, r11q
    punpcklbw            m9, m9
    psraw                m9, 8
    pshufd               m7, m9, q0000
    pshufd               m8, m9, q1111
    pmaddwd              m4, m0, m7
    pmaddwd              m5, m1, m8
    pshufd               m7, m9, q2222
    pshufd               m9, m9, q3333
    pmaddwd              m6, m2, m7
    pmaddwd              m8, m3, m9
 %if isput
    movd                 m9, [rsp+0x28]
  %define vrnd_mem [rsp+0x30]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
    paddd                m4, m5
    paddd                m6, m8
    paddd                m4, m6
    paddd                m4, vrnd_mem
%else
    mov                 mym, myd
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                  r4, 6
    lea                  r5, [r5+r4]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    movd                 m7, r4
    movd                 m6, r3
    punpckldq            m7, m6
    punpcklbw            m7, m7
    psraw                m7, 8
    pshufd               m4, m7, q0000
    pshufd               m5, m7, q1111
    pshufd               m6, m7, q2222
    pshufd               m7, m7, q3333
    pmaddwd              m0, m4
    pmaddwd              m1, m5
    pmaddwd              m2, m6
    pmaddwd              m3, m7
 %if isput
    movd                 m4, [esp+0x18]
 %endif
    paddd                m0, m1
    paddd                m2, m3
    paddd                m0, vrnd_mem
    paddd                m0, m2
    SWAP                 m4, m0
 %define m9 m0
%endif
%if isput
    pxor                 m5, m5
    psrad                m4, m9
    packssdw             m4, m4
    pmaxsw               m4, m5
    pminsw               m4, pxmaxm
    movq             [dstq], m4
    add                dstq, dsmp
%else
    psrad                m4, 6
    packssdw             m4, m4
    movq             [tmpq], m4
    add                tmpq, 8
%endif
    dec                  hd
    jz .ret
%if ARCH_X86_64
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .w4_loop
    mova                 m8, [rsp+0x10]
    movd                 m9, [rsp+0x20]
    movu                 m4, [srcq]
    movu                 m5, [srcq+r4]
    test                myd, 0x400
    jz .w4_skip_line
    mova                 m0, [rsp+0x40]
    mova         [rsp+0x40], m1
    mova                 m1, [rsp+0x50]
    mova         [rsp+0x50], m2
    mova                 m2, [rsp+0x60]
    mova         [rsp+0x60], m3
    pshufb               m4, m12
    pshufb               m5, m14
    pmaddwd              m4, m13
    pmaddwd              m5, m15
    phaddd               m4, m5
    paddd                m4, m8
    psrad                m4, m9
    packssdw             m4, m4
    punpcklwd            m3, m10, m4
    mova                m10, m4
    add                srcq, ssq
    jmp .w4_loop
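; my advanced by two rows: filter both new rows horizontally in one pass and
; shift the line-pair window down by two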
.w4_skip_line:
    movu                 m6, [srcq+ssq*1]
    movu                 m7, [srcq+r6]
    mova                 m0, [rsp+0x50]
    mova                m11, [rsp+0x60]
    pshufb               m4, m12
    pshufb               m6, m12
    pshufb               m5, m14
    pshufb               m7, m14
    pmaddwd              m4, m13
    pmaddwd              m6, m13
    pmaddwd              m5, m15
    pmaddwd              m7, m15
    mova         [rsp+0x40], m0
    mova         [rsp+0x50], m11
    phaddd               m4, m5
    phaddd               m6, m7
    paddd                m4, m8
    paddd                m6, m8
    psrad                m4, m9
    psrad                m6, m9
    packssdw             m4, m6
    punpcklwd            m9, m10, m4
    mova         [rsp+0x60], m9
    pshufd              m10, m4, q1032
    mova                 m0, m1
    mova                 m1, m2
    mova                 m2, m3
    punpcklwd            m3, m4, m10
    lea                srcq, [srcq+ssq*2]
    jmp .w4_loop
%else
    SWAP                 m0, m4
    mov                 myd, mym
    mov                  r3, r3m
    add                 myd, dym
    test                myd, ~0x3ff
    jnz .w4_next_line
    mova                 m0, [stk+0x40]
    mova                 m1, [stk+0x50]
    mova                 m2, [stk+0x60]
    mova                 m3, [stk+0x70]
    jmp .w4_loop
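; x86-32 variant of the line advance: the line pairs live on the stack, so
; rotate the stack slots instead of registers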
.w4_next_line:
    mov                  r5, [stk+0xc0]
    movu                 m4, [srcq]
    movu                 m5, [r5]
    test                myd, 0x400
    jz .w4_skip_line
    add          [stk+0xc0], ssq
    mova                 m0, [stk+0x80]
    mova                 m3, [stk+0x50]
    mova         [stk+0x40], m0
    mova         [stk+0x80], m3
    mova                 m1, [stk+0x90]
    mova                 m6, [stk+0x60]
    mova         [stk+0x50], m1
    mova         [stk+0x90], m6
    mova                 m2, [stk+0xa0]
    mova                 m7, [stk+0x70]
    mova         [stk+0x60], m2
    mova         [stk+0xa0], m7
    pshufb               m4, m12
    pshufb               m5, m14
    pmaddwd              m4, m13
    pmaddwd              m5, m15
    phaddd               m4, m5
    paddd                m4, hrnd_mem
    psrad                m4, hsh_mem
    packssdw             m4, m4
    punpcklwd            m3, [stk+0xb0], m4
    mova         [stk+0xb0], m4
    mova         [stk+0x70], m3
    add                srcq, ssq
    jmp .w4_loop
.w4_skip_line:
    movu                 m6, [srcq+ssq*1]
    movu                 m7, [r5  +ssq*1]
    lea                  r5, [r5  +ssq*2]
    mov          [stk+0xc0], r5
    mova                 m0, [stk+0x50]
    mova                 m1, [stk+0x60]
    mova                 m2, [stk+0x70]
    mova                 m3, [stk+0x90]
    pshufb               m4, m12
    pshufb               m6, m12
    pshufb               m5, m14
    pshufb               m7, m14
    pmaddwd              m4, m13
    pmaddwd              m6, m13
    pmaddwd              m5, m15
    pmaddwd              m7, m15
    mova         [stk+0x40], m0
    mova         [stk+0x50], m1
    mova         [stk+0x60], m2
    mova         [stk+0x80], m3
    phaddd               m4, m5
    phaddd               m6, m7
    mova                 m5, [stk+0xa0]
    mova                 m7, [stk+0xb0]
    paddd                m4, hrnd_mem
    paddd                m6, hrnd_mem
    psrad                m4, hsh_mem
    psrad                m6, hsh_mem
    packssdw             m4, m6
    punpcklwd            m7, m4
    pshufd               m6, m4, q1032
    mova         [stk+0x90], m5
    mova         [stk+0xa0], m7
    mova         [stk+0xb0], m6
    punpcklwd            m3, m4, m6
    mova         [stk+0x70], m3
    lea                srcq, [srcq+ssq*2]
    jmp .w4_loop
%endif
INIT_XMM ssse3
%if ARCH_X86_64
 %define stk rsp+0x20
%endif
.w8:
    mov    dword [stk+0xf0], 1
    movifprep   tmp_stridem, 16
    jmp .w_start
.w16:
    mov    dword [stk+0xf0], 2
    movifprep   tmp_stridem, 32
    jmp .w_start
.w32:
    mov    dword [stk+0xf0], 4
    movifprep   tmp_stridem, 64
    jmp .w_start
.w64:
    mov    dword [stk+0xf0], 8
    movifprep   tmp_stridem, 128
    jmp .w_start
.w128:
    mov    dword [stk+0xf0], 16
    movifprep   tmp_stridem, 256
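; w8-w128 are processed as w/8 stripes of 8 pixels: [stk+0xf0] holds the
; stripe count and tmp_stridem the prep output stride in bytes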
.w_start:
%if ARCH_X86_64
 %ifidn %1, put
    movifnidn           dsm, dsq
 %endif
    mova         [rsp+0x10], m11
 %define hround m11
    shr                 t0d, 16
    movd                m15, t0d
 %if isprep
    mova                m13, [base+pd_m524256]
 %endif
%else
 %define hround [esp+0x00]
 %define m12    [esp+0x10]
 %define m10    [base+pd_0x3ff]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov                  r4, [esp+0x1f0]
    shr                  r4, 16
    movd                m15, r4
    mov                  r0, r0m
    mov                 myd, mym
%endif
    sub                srcq, 6
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    mova        [stk+0x100], m7
    mova        [stk+0x120], m15
    mov         [stk+0x0f8], srcq
    mov         [stk+0x130], r0q ; dstq / tmpq
%if ARCH_X86_64 && UNIX64
    mov                  hm, hd
%elif ARCH_X86_32
    mov                  r5, hm
    mov         [stk+0x0f4], myd
    mov         [stk+0x134], r5
%endif
    jmp .hloop
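; per-stripe epilogue: restore the saved horizontal state, advance dst/tmp by
; 8 pixels and the mx vector by dx*4, then run the next stripe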
.hloop_prep:
    dec   dword [stk+0x0f0]
    jz .ret
%if ARCH_X86_64
    add   qword [stk+0x130], 16
    mov                  hd, hm
%else
    add   dword [stk+0x130], 16
    mov                 myd, [stk+0x0f4]
    mov                  r5, [stk+0x134]
    mov                  r0, [stk+0x130]
%endif
    mova                 m7, [stk+0x100]
    mova                m14, [stk+0x110]
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
    mova                m11, [rsp+0x10]
%endif
    mova                m15, [stk+0x120]
    mov                srcq, [stk+0x0f8]
%if ARCH_X86_64
    mov                 r0q, [stk+0x130] ; dstq / tmpq
%else
    mov                 mym, myd
    mov                  hm, r5
    mov                 r0m, r0
    mov                  r3, r3m
%endif
    paddd               m14, m7
.hloop:
%if ARCH_X86_64
    mova                 m9, [base+pq_0x40000000]
%else
 %define m9 [base+pq_0x40000000]
%endif
    pxor                 m1, m1
    psrld                m2, m14, 10
    mova              [stk], m2
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd                m5, m15, m6
    pcmpeqd              m6, m1
    pshufd               m2, m5, q1032
%if ARCH_X86_64
    movd                r4d, m5
    movd                r6d, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                r7d, m5
    movd                r9d, m2
    movq                 m0, [base+subpel_filters+r4*8]
    movq                 m1, [base+subpel_filters+r6*8]
    movhps               m0, [base+subpel_filters+r7*8]
    movhps               m1, [base+subpel_filters+r9*8]
%else
    movd                 r0, m5
    movd                 rX, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                 r4, m5
    movd                 r5, m2
    movq                 m0, [base+subpel_filters+r0*8]
    movq                 m1, [base+subpel_filters+rX*8]
    movhps               m0, [base+subpel_filters+r4*8]
    movhps               m1, [base+subpel_filters+r5*8]
%endif
    paddd               m14, m7 ; mx+dx*[4-7]
    pand                 m5, m14, m10
    psrld                m5, 6
    paddd               m15, m5
    pxor                 m2, m2
    pcmpeqd              m5, m2
    mova        [stk+0x110], m14
    pshufd               m4, m15, q1032
%if ARCH_X86_64
    movd               r10d, m15
    movd               r11d, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    movq                r11, m14
    punpckhqdq          m14, m14
    movq                 rX, m14
    mov                r10d, r11d
    shr                 r11, 32
    mov                r13d, rXd
    shr                  rX, 32
    mov                 r4d, [stk+ 0]
    mov                 r6d, [stk+ 4]
    mov                 r7d, [stk+ 8]
    mov                 r9d, [stk+12]
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd              m14, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m7, m9, m4
    pand                 m8, m9, m6
    pand                m15, m9, m14
    pand                 m9, m9, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn               m14, m2
    pandn                m5, m3
    por                  m7, m4
    por                  m8, m6
    por                 m15, m14
    por                  m9, m5
    punpcklbw            m0, m7, m7
    punpckhbw            m7, m7
    punpcklbw            m1, m8, m8
    punpckhbw            m8, m8
    psraw                m0, 8
    psraw                m7, 8
    psraw                m1, 8
    psraw                m8, 8
    punpcklbw            m2, m15, m15
    punpckhbw           m15, m15
    punpcklbw            m3, m9, m9
    punpckhbw            m9, m9
    psraw                m2, 8
    psraw               m15, 8
    psraw                m3, 8
    psraw                m9, 8
    mova         [stk+0x10], m0
    mova         [stk+0x20], m7
    mova         [stk+0x30], m1
    mova         [stk+0x40], m8
    mova         [stk+0x50], m2
    mova         [stk+0x60], m15
    mova         [stk+0x70], m3
    mova         [stk+0x80], m9
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova         [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova         [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova         [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova         [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova         [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    mova                 m5, [stk+0xd0]
    mova                 m1, [stk+0x90]
    mova                 m2, [stk+0xa0]
    mova                 m3, [stk+0xb0]
    mova                 m9, [stk+0xc0]
    mov                 myd, mym
    mov                 dyd, dym
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m9 ; 23a
    punpckhwd            m3, m9     ; 23b
    mova         [stk+0x90], m4
    mova         [stk+0xa0], m5
    mova         [stk+0xb0], m6
    mova         [stk+0xc0], m7
 %define hround [rsp+0x10]
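; vertical loop: reload the row's 8-tap filter from my (single-tap 64 << 24
; when the subpel index is 0) and accumulate the 01/23/45/67 line pairs into
; two dword accumulators (low/high half of the 8-pixel stripe)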
.vloop:
    and                 myd, 0x3ff
    mov                 r6d, 64 << 24
    mov                 r4d, myd
    shr                 r4d, 6
    lea                 r4d, [t1+r4]
    cmovnz              r6q, [base+subpel_filters+r4*8]
    movq                m11, r6q
    punpcklbw           m11, m11
    psraw               m11, 8
    pshufd               m5, m11, q0000
    pshufd               m7, m11, q1111
    pshufd              m10, m11, q2222
    pshufd              m11, m11, q3333
    pmaddwd              m4, m5, m0
    pmaddwd              m5, m5, m1
    pmaddwd              m6, m7, m2
    pmaddwd              m7, m7, m3
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
    pmaddwd              m6, [stk+0x90], m10
    pmaddwd              m7, [stk+0xa0], m10
    pmaddwd              m8, [stk+0xb0], m11
    pmaddwd              m9, [stk+0xc0], m11
    paddd                m4, m6
    paddd                m5, m7
 %if isput
    pshufd               m6, m12, q1032
 %endif
    paddd                m4, m8
    paddd                m5, m9
%else
    movd                 r0, m15
    movd                 rX, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [stk+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10
    mova           [stk+16], m14
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m9, m4
    pand                 m1, m9, m6
    pand                 m2, m9, m7
    pand                 m3, m9, m5
    pandn                m4, [stk+0x20]
    pandn                m6, [stk+0x30]
    pandn                m7, [stk+0x40]
    pandn                m5, [stk+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    punpcklbw            m4, m0, m0
    punpckhbw            m0, m0
    punpcklbw            m5, m1, m1
    punpckhbw            m1, m1
    psraw                m4, 8
    psraw                m0, 8
    psraw                m5, 8
    psraw                m1, 8
    punpcklbw            m6, m2, m2
    punpckhbw            m2, m2
    punpcklbw            m7, m3, m3
    punpckhbw            m3, m3
    psraw                m6, 8
    psraw                m2, 8
    psraw                m7, 8
    psraw                m3, 8
    mova        [stk+0x0a0], m4
    mova        [stk+0x0b0], m0
    mova        [stk+0x0c0], m5
    mova        [stk+0x0d0], m1
    mova        [stk+0x140], m6
    mova        [stk+0x150], m2
    mova        [stk+0x160], m7
    mova        [stk+0x170], m3
    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
    mova                 m5, [stk+0x60]
    mova                 m6, [stk+0x70]
    mova                 m7, [stk+0x80]
    mova                 m0, [stk+0x90]
    mov                 myd, mym
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova         [stk+0x60], m4
    mova         [stk+0x70], m5
    mova         [stk+0x80], m6
    mova         [stk+0x90], m7
    mova                 m1, [stk+0x20]
    mova                 m2, [stk+0x30]
    mova                 m3, [stk+0x40]
    mova                 m4, [stk+0x50]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
.vloop:
    mov                  r0, r0m
    mov                  r5, [esp+0x1f4]
    and                 myd, 0x3ff
    mov                 mym, myd
    xor                  r3, r3
    shr                  r4, 6
    lea                  r5, [r5+r4]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    movd                 m7, r4
    movd                 m6, r3
    punpckldq            m7, m6
    punpcklbw            m7, m7
    psraw                m7, 8
    pshufd               m4, m7, q0000
    pshufd               m5, m7, q1111
    pmaddwd              m0, m4
    pmaddwd              m1, m4
    pmaddwd              m2, m5
    pmaddwd              m3, m5
    pshufd               m6, m7, q2222
    pshufd               m7, m7, q3333
    paddd                m0, m2
    paddd                m1, m3
    pmaddwd              m2, [stk+0x60], m6
    pmaddwd              m3, [stk+0x70], m6
    pmaddwd              m4, [stk+0x80], m7
    pmaddwd              m5, [stk+0x90], m7
 %if isput
    movd                 m6, [esp+0x18]
 %endif
    paddd                m0, m2
    paddd                m1, m3
    paddd                m0, vrnd_mem
    paddd                m1, vrnd_mem
    paddd                m4, m0
    paddd                m5, m1
%endif
%ifidn %1, put
    psrad                m4, m6
    psrad                m5, m6
    packssdw             m4, m5
    pxor                 m7, m7
    pmaxsw               m4, m7
    pminsw               m4, pxmaxm
    mova             [dstq], m4
    add                dstq, dsm
%else
    psrad                m4, 6
    psrad                m5, 6
    packssdw             m4, m5
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .hloop_prep
%if ARCH_X86_64
    add                 myd, dyd
    test                myd, ~0x3ff
    jz .vloop
    test                myd, 0x400
    mov         [stk+0x140], myd
    mov                 r4d, [stk+ 0]
    mov                 r6d, [stk+ 4]
    mov                 r7d, [stk+ 8]
    mov                 r9d, [stk+12]
    jz .skip_line
    mova                m14, [base+unpckw]
    movu                 m8, [srcq+r10*2]
    movu                 m9, [srcq+r11*2]
    movu                m10, [srcq+r13*2]
    movu                m11, [srcq+ rX*2]
    movu                 m4, [srcq+ r4*2]
    movu                 m5, [srcq+ r6*2]
    movu                 m6, [srcq+ r7*2]
    movu                 m7, [srcq+ r9*2]
    add                srcq, ssq
    mov                 myd, [stk+0x140]
    mov                 dyd, dym
    pshufd              m15, m14, q1032
    pshufb               m0, m14                ; 0a 1a
    pshufb               m1, m14                ; 0b 1b
    pshufb               m2, m15                ; 3a 2a
    pshufb               m3, m15                ; 3b 2b
    pmaddwd              m8, [stk+0x50]
    pmaddwd              m9, [stk+0x60]
    pmaddwd             m10, [stk+0x70]
    pmaddwd             m11, [stk+0x80]
    pmaddwd              m4, [stk+0x10]
    pmaddwd              m5, [stk+0x20]
    pmaddwd              m6, [stk+0x30]
    pmaddwd              m7, [stk+0x40]
    phaddd               m8, m9
    phaddd              m10, m11
    mova                m11, hround
    phaddd               m4, m5
    phaddd               m6, m7
    phaddd               m8, m10
    phaddd               m4, m6
    paddd                m4, m11
    paddd                m8, m11
    psrad                m4, m12
    psrad                m8, m12
    packssdw             m4, m8
    pshufb               m5, [stk+0x90], m14    ; 4a 5a
    pshufb               m6, [stk+0xa0], m14    ; 4b 5b
    pshufb               m7, [stk+0xb0], m15    ; 7a 6a
    pshufb               m8, [stk+0xc0], m15    ; 7b 6b
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    punpckhwd            m5, m7 ; 56a
    punpckhwd            m6, m8 ; 56b
    punpcklwd            m7, m4 ; 78a
    punpckhqdq           m4, m4
    punpcklwd            m8, m4 ; 78b
    mova         [stk+0x90], m5
    mova         [stk+0xa0], m6
    mova         [stk+0xb0], m7
    mova         [stk+0xc0], m8
    jmp .vloop
.skip_line:
    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 10, 11
    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 0, 10, 11
    mov                 myd, [stk+0x140]
    mov                 dyd, dym
    mova                 m0, m2         ; 01a
    mova                 m1, m3         ; 01b
    mova                 m2, [stk+0x90] ; 23a
    mova                 m3, [stk+0xa0] ; 23b
    mova                 m5, [stk+0xb0] ; 45a
    mova                 m6, [stk+0xc0] ; 45b
    punpcklwd            m7, m4, m8     ; 67a
    punpckhwd            m4, m8         ; 67b
    mova         [stk+0x90], m5
    mova         [stk+0xa0], m6
    mova         [stk+0xb0], m7
    mova         [stk+0xc0], m4
%else
    mov                 r0m, r0
    mov                 myd, mym
    mov                  r3, r3m
    add                 myd, dym
    test                myd, ~0x3ff
    mov                 mym, myd
    jnz .next_line
    mova                 m0, [stk+0x20]
    mova                 m1, [stk+0x30]
    mova                 m2, [stk+0x40]
    mova                 m3, [stk+0x50]
    jmp .vloop
.next_line:
    test                myd, 0x400
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    jz .skip_line
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    mova                 m7, [base+unpckw]
    pshufd               m4, m7, q1032
    pshufb               m0, [stk+0x20], m7 ; 0a 1a
    pshufb               m1, [stk+0x30], m7 ; 0b 1b
    pshufb               m2, [stk+0x40], m4 ; 3a 2a
    pshufb               m3, [stk+0x50], m4 ; 3b 2b
    pshufb               m5, [stk+0x60], m7 ; 4a 5a
    pshufb               m6, [stk+0x70], m7 ; 4b 5b
    pshufb               m7, [stk+0x80], m4 ; 7a 6a
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    punpckhwd            m5, m7 ; 56a
    mova         [stk+0x60], m5
    pshufb               m5, [stk+0x90], m4 ; 7b 6b
    punpcklwd            m7, [stk+0xe0] ; 78a
    punpckhwd            m6, m5 ; 56b
    mova         [stk+0x70], m6
    movq                 m6, [stk+0xe8]
    mova         [stk+0x80], m7
    punpcklwd            m5, m6
    mov                 myd, mym
    mova         [stk+0x90], m5
    jmp .vloop
.skip_line:
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    MC_8TAP_SCALED_H 0xa0, 0       ; 9
    mova                 m7, [stk+0xe0]
    mova                 m2, [stk+0x60] ; 23a
    mova                 m3, [stk+0x70] ; 23b
    mova                 m4, [stk+0x80] ; 45a
    mova                 m5, [stk+0x90] ; 45b
    punpcklwd            m6, m7, m0     ; 67a
    punpckhwd            m7, m0         ; 67b
    mova                 m0, [stk+0x40] ; 01a
    mova                 m1, [stk+0x50] ; 01b
    mov                 myd, mym
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    mova         [stk+0x60], m4
    mova         [stk+0x70], m5
    mova         [stk+0x80], m6
    mova         [stk+0x90], m7
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
%endif
    jmp .vloop
INIT_XMM ssse3
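; .dy1: vertical step of exactly one source row per output row; the vertical
; filter is constant, so it is loaded once and the line window simply slides
; down one row per output row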
.dy1:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy1_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%if isput
.dy1_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
 %else
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
  %define m11 [esp+0x00]
  %define m12 [esp+0x10]
  %define m13 [esp+0x20]
    movzx                r5, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r5
    mov                  r1, r1m
 %endif
    pxor                 m9, m9
    punpckldq            m9, m8
    paddd               m14, m9 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    pshufd              m15, m15, q0321
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_q]
    mova                 m6, [base+spel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m2, m2
    pcmpeqd              m8, m2
    psrld               m14, 10
    paddd               m14, m14
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova              [stk], m14
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m15 m6
 %endif
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*1]
    movu                 m2, [srcq+ssq*2]
    movu                 m3, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    punpckldq           m15, m7
 %if ARCH_X86_64
    pshufb              m14, m5
    paddb               m14, m6
    pand                 m9, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m9
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, ss3q
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
 %else
    pand                 m7, m5, [base+pd_0x4000]
    pandn                m5, m15
    por                  m5, m7
  %define m15 m5
    mov                 myd, mym
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                 myd, 6
    lea                  r5, [r5+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    mov          [stk+0x20], r3
    mov                  r3, r3m
 %endif
    punpcklbw           m15, m15
    psraw               m15, 8
    REPX    {pshufb x, m14}, m0, m1, m2, m3
    REPX   {pmaddwd x, m15}, m0, m1, m2, m3
 %if ARCH_X86_64
    REPX    {pshufb x, m14}, m4, m5, m6
    REPX   {pmaddwd x, m15}, m4, m5, m6
    phaddd               m0, m1
    phaddd               m2, m3
    phaddd               m4, m5
    phaddd               m6, m6
    REPX     {paddd x, m11}, m0, m2, m4, m6
    REPX     {psrad x, m12}, m0, m2, m4, m6
    packssdw             m0, m2 ; 0 1 2 3
    packssdw             m4, m6 ; 4 5 6
    SWAP                 m1, m4
    movq                m10, r4
 %else
    mova         [stk+0x10], m15
    phaddd               m0, m1
    phaddd               m2, m3
    movu                 m1, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    add                srcq, ss3q
    REPX    {pshufb x, m14}, m1, m7, m6
    REPX   {pmaddwd x, m15}, m1, m7, m6
  %define m14 [stk+0x00]
  %define m15 [stk+0x10]
    phaddd               m1, m7
    phaddd               m6, m6
    REPX     {paddd x, m11}, m0, m2, m1, m6
    REPX     {psrad x, m12}, m0, m2, m1, m6
    packssdw             m0, m2
    packssdw             m1, m6
  %define m8  m6
  %define m9  m4
  %define m10 m5
    movd                m10, r4
    movd                 m9, [stk+0x20]
    punpckldq           m10, m9
 %endif
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
 %if ARCH_X86_32
    mova         [stk+0x50], m7
    mova         [stk+0x60], m8
    mova         [stk+0x70], m9
    mova         [stk+0x80], m10
  %define m7  [stk+0x50]
  %define m8  [stk+0x60]
  %define m9  [stk+0x70]
  %define m10 [stk+0x80]
 %endif
    palignr              m2, m1, m0, 4 ; 1 2 3 4
    punpcklwd            m3, m0, m2    ; 01 12
    punpckhwd            m0, m2        ; 23 34
    pshufd               m4, m1, q2121 ; 5 6 5 6
    punpcklwd            m2, m1, m4    ; 45 56
 %if ARCH_X86_32
    mov                  r0, r0m
 %endif
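; two output rows per iteration: rows 7 and 8 are fetched and filtered
; horizontally, then appended to the sliding 01/12..45/56 pair window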
.dy1_w2_loop:
    movu                 m1, [srcq+ssq*0]
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    pmaddwd              m5, m3, m7
    mova                 m3, m0
    pmaddwd              m0, m8
    pshufb               m1, m14
    pshufb               m6, m14
    pmaddwd              m1, m15
    pmaddwd              m6, m15
    phaddd               m1, m6
    paddd                m1, m11
    psrad                m1, m12
    packssdw             m1, m1
    paddd                m5, m0
    mova                 m0, m2
    pmaddwd              m2, m9
    paddd                m5, m2
    palignr              m2, m1, m4, 12
    punpcklwd            m2, m1        ; 67 78
    pmaddwd              m4, m2, m10
    paddd                m5, m13
    paddd                m5, m4
    pxor                 m6, m6
    mova                 m4, m1
    pshufd               m1, m12, q1032
    psrad                m5, m1
    packssdw             m5, m5
    pmaxsw               m5, m6
    pminsw               m5, pxmaxm
    movd       [dstq+dsq*0], m5
    pshuflw              m5, m5, q1032
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy1_w2_loop
    RET
%endif
INIT_XMM ssse3
.dy1_w4:
%if ARCH_X86_64
    mov                 myd, mym
    mova         [rsp+0x10], m11
    mova         [rsp+0x20], m12
 %if isput
    mova         [rsp+0x30], m13
  %define vrnd_mem [rsp+0x30]
  %define stk rsp+0x40
 %else
  %define vrnd_mem [base+pd_m524256]
  %define stk rsp+0x30
 %endif
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m9  [base+pd_0x4000]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq r3
 %endif
    movzx                r5, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r5
%endif
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m0, m14, m10
    psrld                m0, 6
    paddd               m15, m0
    pshufd               m7, m15, q1032
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                r6d, m15
    movd               r13d, m7
    mova                m10, [base+bdct_lb_q+ 0]
    mova                m11, [base+bdct_lb_q+16]
    movd                m13, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+ r6*8+2]
    movd                m15, [base+subpel_filters+r11*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
%else
    movd                 r0, m15
    movd                 r4, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                 rX, m15
    movd                 r5, m7
    mova                 m5, [base+bdct_lb_q+ 0]
    mova                 m6, [base+bdct_lb_q+16]
    movd                 m1, [base+subpel_filters+r0*8+2]
    movd                 m2, [base+subpel_filters+rX*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m7, [base+subpel_filters+r5*8+2]
    SWAP                 m4, m7
 %if isprep
    mov                  r3, r3m
 %endif
 %define m10 m5
 %define m11 m6
 %define m12 m1
 %define m13 m1
%endif
    psrld               m14, 10
    paddd               m14, m14
    punpckldq           m13, m2
    punpckldq           m15, m4
    punpcklqdq          m13, m15
    pxor                 m2, m2
    pcmpeqd              m0, m2
%if ARCH_X86_64
    pand                 m9, m0
%else
    pand                 m2, m9, m0
 %define m9 m2
    SWAP                 m7, m4
%endif
    pandn                m0, m13
%if ARCH_X86_64
    SWAP                m13, m0
%else
 %define m13 m0
%endif
    por                 m13, m9
    punpckhbw           m15, m13, m13
    punpcklbw           m13, m13
    psraw               m15, 8
    psraw               m13, 8
    pshufb              m12, m14, m10
    pshufb              m14, m11
    mova                m10, [base+spel_s_shuf2]
    movd                r4d, m14
    shr                 r4d, 24
%if ARCH_X86_32
    mova         [stk+0x40], m13
    mova         [stk+0x50], m15
    pxor                 m2, m2
%endif
    pshufb               m7, m14, m2
    psubb               m14, m7
    paddb               m12, m10
    paddb               m14, m10
%if ARCH_X86_64
    lea                  r6, [r4+ssq*1]
    lea                 r11, [r4+ssq*2]
    lea                 r13, [r4+ss3q ]
    movu                 m7, [srcq+ssq*0]
    movu                 m9, [srcq+ssq*1]
    movu                 m8, [srcq+ssq*2]
    movu                m10, [srcq+ss3q ]
    movu                 m1, [srcq+r4   ]
    movu                 m3, [srcq+r6   ]
    movu                 m2, [srcq+r11  ]
    movu                 m4, [srcq+r13  ]
    lea                srcq, [srcq+ssq*4]
    REPX    {pshufb x, m12}, m7, m9, m8, m10
    REPX   {pmaddwd x, m13}, m7, m9, m8, m10
    REPX    {pshufb x, m14}, m1, m3, m2, m4
    REPX   {pmaddwd x, m15}, m1, m3, m2, m4
    mova                 m5, [rsp+0x10]
    movd                xm6, [rsp+0x20]
    phaddd               m7, m1
    phaddd               m9, m3
    phaddd               m8, m2
    phaddd              m10, m4
    movu                 m1, [srcq+ssq*0]
    movu                 m2, [srcq+ssq*1]
    movu                 m3, [srcq+ssq*2]
    REPX      {paddd x, m5}, m7, m9, m8, m10
    REPX     {psrad x, xm6}, m7, m9, m8, m10
    packssdw             m7, m9  ; 0 1
    packssdw             m8, m10 ; 2 3
    movu                 m0, [srcq+r4   ]
    movu                 m9, [srcq+r6   ]
    movu                m10, [srcq+r11  ]
    add                srcq, ss3q
    REPX    {pshufb x, m12}, m1, m2, m3
    REPX   {pmaddwd x, m13}, m1, m2, m3
    REPX    {pshufb x, m14}, m0, m9, m10
    REPX   {pmaddwd x, m15}, m0, m9, m10
    phaddd               m1, m0
    phaddd               m2, m9
    phaddd               m3, m10
    shr                 myd, 6
    mov                r13d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz             r13q, [base+subpel_filters+myq*8]
    REPX      {paddd x, m5}, m1, m2, m3
    REPX     {psrad x, xm6}, m1, m2, m3
    packssdw             m1, m2 ; 4 5
    packssdw             m3, m3 ; 6 6
    SWAP                 m9, m1
    shufps               m4, m7, m8, q1032  ; 1 2
    shufps               m5, m8, m9, q1032  ; 3 4
    shufps               m6, m9, m3, q1032  ; 5 6
    punpcklwd            m0, m7, m4 ; 01
    punpckhwd            m7, m4     ; 12
    punpcklwd            m1, m8, m5 ; 23
    punpckhwd            m8, m5     ; 34
    punpcklwd            m2, m9, m6 ; 45
    punpckhwd            m9, m6     ; 56
    movq                m10, r13
    mova         [stk+0x00], m1
    mova         [stk+0x10], m8
    mova         [stk+0x20], m2
    mova         [stk+0x30], m9
    mova         [stk+0x40], m3
 %define hrnd_mem [rsp+0x10]
 %define hsh_mem  [rsp+0x20]
 %define vsh_mem  [rsp+0x28]
 %if isput
  %define vrnd_mem [rsp+0x30]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
%else
    mova         [stk+0x20], m12
    mova         [stk+0x30], m14
    add                  r4, srcq
    MC_4TAP_SCALED_H   0x60 ; 0 1
    MC_4TAP_SCALED_H   0x70 ; 2 3
    MC_4TAP_SCALED_H   0x80 ; 4 5
    movu                 m7, [srcq]
    movu                 m2, [r4]
    add                srcq, ssq
    add                  r4, ssq
    mov          [stk+0xb0], r4
    pshufb               m7, m12
    pshufb               m2, m14
    pmaddwd              m7, m13
    pmaddwd              m2, m15
    phaddd               m7, m2
    paddd                m7, [esp+0x00]
    psrad                m7, [esp+0x10]
    packssdw             m7, m7 ; 6 6
    mova                 m4, [stk+0x60]
    mova                 m5, [stk+0x70]
    mova                 m6, [stk+0x80]
    mov                 myd, mym
    mov                  rX, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  rX, [rX+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+rX*8+0]
    cmovnz               r5, [base+subpel_filters+rX*8+4]
    mov                  r3, r3m
    shufps               m1, m4, m5, q1032 ; 1 2
    shufps               m2, m5, m6, q1032 ; 3 4
    shufps               m3, m6, m7, q1032 ; 5 6
    mova         [stk+0xa0], m7
    punpcklwd            m0, m4, m1         ; 01
    punpckhwd            m4, m1             ; 12
    punpcklwd            m1, m5, m2         ; 23
    punpckhwd            m5, m2             ; 34
    punpcklwd            m2, m6, m3         ; 45
    punpckhwd            m6, m3             ; 56
    movd                 m7, r4
    movd                 m3, r5
    mov                  r0, r0m
 %if isput
    mov                  r1, r1m
 %endif
    mov                  r4, [stk+0xb0]
    mova         [stk+0xc0], m4 ; 12
    mova         [stk+0x60], m1 ; 23
    mova         [stk+0x70], m2 ; 45
    mova         [stk+0x80], m5 ; 34
    mova         [stk+0x90], m6 ; 56
 %define m12 [stk+0x20]
 %define m14 [stk+0x30]
 %define m13 [stk+0x40]
 %define m15 [stk+0x50]
 %define hrnd_mem [esp+0x00]
 %define hsh_mem  [esp+0x10]
 %define vsh_mem  [esp+0x18]
 %if isput
  %define vrnd_mem [esp+0x20]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
 %define m10 m7
    punpckldq           m10, m3
%endif
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m3, m10, q0000
    pshufd               m4, m10, q1111
    pshufd               m5, m10, q2222
    pshufd              m10, m10, q3333
%if ARCH_X86_32
 %xdefine m8  m3
 %xdefine m9  m6
 %xdefine m11 m5
 %xdefine m6  m4
    mova         [stk+0x100], m3
    mova         [stk+0x110], m4
    mova         [stk+0x120], m5
    mova         [stk+0x130], m10
 %define m3  [stk+0x100]
 %define m4  [stk+0x110]
 %define m5  [stk+0x120]
 %define m10 [stk+0x130]
    mova                 m7, [stk+0xc0]
    mova                 m8, [stk+0x80]
%endif
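; two output rows per iteration: rows 7 and 8 are filtered horizontally and
; appended while the 01..56 line pairs rotate down by two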
.dy1_w4_loop:
    movu                m11, [srcq+ssq*0]
    movu                 m6, [srcq+ssq*1]
    pmaddwd              m0, m3
    pmaddwd              m7, m3
    pmaddwd              m1, m4
    pmaddwd              m8, m4
    pmaddwd              m2, m5
    pmaddwd              m9, m5
    paddd                m1, m0
    paddd                m8, m7
%if ARCH_X86_64
    movu                 m0, [srcq+r4]
    movu                 m7, [srcq+r6]
%else
    movu                 m0, [r4+ssq*0]
    movu                 m7, [r4+ssq*1]
    lea                  r4, [r4+ssq*2]
%endif
    lea                srcq, [srcq+ssq*2]
    paddd                m1, m2
    paddd                m8, m9
    pshufb              m11, m12
    pshufb               m6, m12
    pmaddwd             m11, m13
    pmaddwd              m6, m13
    pshufb               m0, m14
    pshufb               m7, m14
    pmaddwd              m0, m15
    pmaddwd              m7, m15
    phaddd              m11, m0
    phaddd               m6, m7
    paddd               m11, hrnd_mem
    paddd                m6, hrnd_mem
    psrad               m11, hsh_mem
    psrad                m6, hsh_mem
    packssdw            m11, m6                     ; 7 8
%if ARCH_X86_64
    shufps               m9, [stk+0x40], m11, q1032 ; 6 7
    mova                 m0, [stk+0x00]
    mova         [stk+0x40], m11
%else
    shufps               m9, [stk+0xa0], m11, q1032 ; 6 7
    mova                 m0, [stk+0x60]
    mova         [stk+0xa0], m11
%endif
    punpcklwd            m2, m9, m11 ; 67
    punpckhwd            m9, m11     ; 78
    pmaddwd              m6, m2, m10
    pmaddwd              m7, m9, m10
%if isput
    movd                m11, vsh_mem
%endif
    paddd                m1, vrnd_mem
    paddd                m8, vrnd_mem
    paddd                m1, m6
    paddd                m8, m7
%if ARCH_X86_64
    mova                 m7, [stk+0x10]
%else
    mova                 m7, [stk+0x80]
%endif
%if isput
    psrad                m1, m11
    psrad                m8, m11
%else
    psrad                m1, 6
    psrad                m8, 6
%endif
    packssdw             m1, m8
%if ARCH_X86_64
    mova                 m8, [stk+0x30]
%else
    mova                 m8, [stk+0x90]
%endif
%if isput
    pxor                 m6, m6
    pmaxsw               m1, m6
    pminsw               m1, pxmaxm
    movq       [dstq+dsq*0], m1
    movhps     [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
%else
    mova             [tmpq], m1
    add                tmpq, 16
%endif
%if ARCH_X86_64
    mova                 m1, [stk+0x20]
    mova         [stk+0x10], m8
    mova         [stk+0x00], m1
    mova         [stk+0x20], m2
    mova         [stk+0x30], m9
%else
    mova                 m1, [stk+0x70]
    mova         [stk+0x80], m8
    mova         [stk+0x60], m1
    mova         [stk+0x70], m2
    mova         [stk+0x90], m9
%endif
    sub                  hd, 2
    jg .dy1_w4_loop
    MC_8TAP_SCALED_RET ; why not jz .ret?
4831INIT_XMM ssse3
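; Widths >= 8 are processed as w/8 tiles of 8 columns: each entry point
; records the tile count in [stk+0xf0] and, for prep, the packed output
; stride in tmp_stridem, then shares the .dy1_w_start setup below.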
.dy1_w8:
    mov    dword [stk+0xf0], 1
    movifprep   tmp_stridem, 16
    jmp .dy1_w_start
.dy1_w16:
    mov    dword [stk+0xf0], 2
    movifprep   tmp_stridem, 32
    jmp .dy1_w_start
.dy1_w32:
    mov    dword [stk+0xf0], 4
    movifprep   tmp_stridem, 64
    jmp .dy1_w_start
.dy1_w64:
    mov    dword [stk+0xf0], 8
    movifprep   tmp_stridem, 128
    jmp .dy1_w_start
.dy1_w128:
    mov    dword [stk+0xf0], 16
    movifprep   tmp_stridem, 256
.dy1_w_start:
    mov                 myd, mym
%if ARCH_X86_64
 %ifidn %1, put
    movifnidn           dsm, dsq
 %endif
    mova         [rsp+0x10], m11
    mova         [rsp+0x20], m12
 %define hround m11
 %if isput
    mova         [rsp+0x30], m13
 %else
    mova                m13, [base+pd_m524256]
 %endif
    shr                 t0d, 16
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    movd                m15, t0d
%else
 %define hround [esp+0x00]
 %define m12    [esp+0x10]
 %define m10    [base+pd_0x3ff]
 %define m8  m0
 %xdefine m14 m4
 %xdefine m15 m3
 %if isprep
  %define ssq ssm
 %endif
    mov                  r5, [esp+0x1f0]
    mov                  r3, [esp+0x1f4]
    shr                  r5, 16
    movd                m15, r5
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r0, r0m
    mov                  r3, r3m
%endif
    sub                srcq, 6
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq                 m3, r4q
%else
    movd                 m5, r4
    movd                 m6, r5
    punpckldq            m5, m6
    SWAP                 m3, m5
%endif
    punpcklbw            m3, m3
    psraw                m3, 8
    mova        [stk+0x100], m7
    mova        [stk+0x120], m15
    mov         [stk+0x0f8], srcq
    mov         [stk+0x130], r0q ; dstq / tmpq
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
%if ARCH_X86_64
    mova        [stk+0x140], m0
    mova        [stk+0x150], m1
    mova        [stk+0x160], m2
    mova        [stk+0x170], m3
 %if UNIX64
    mov                  hm, hd
 %endif
%else
    mova        [stk+0x180], m0
    mova        [stk+0x190], m1
    mova        [stk+0x1a0], m2
    mova        [stk+0x1b0], m3
    SWAP                 m5, m3
    mov                  r5, hm
    mov         [stk+0x134], r5
%endif
    jmp .dy1_hloop
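; Advance to the next 8-column tile: step the dst/tmp pointer by 16
; bytes, restore srcq and the saved horizontal state, and bump the
; per-lane mx accumulators by dx*4 before re-entering .dy1_hloop.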
.dy1_hloop_prep:
    dec   dword [stk+0x0f0]
    jz .ret
%if ARCH_X86_64
    add   qword [stk+0x130], 16
    mov                  hd, hm
%else
    add   dword [stk+0x130], 16
    mov                  r5, [stk+0x134]
    mov                  r0, [stk+0x130]
%endif
    mova                 m7, [stk+0x100]
    mova                m14, [stk+0x110]
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
    mova                m11, [rsp+0x10]
%endif
    mova                m15, [stk+0x120]
    mov                srcq, [stk+0x0f8]
%if ARCH_X86_64
    mov                 r0q, [stk+0x130] ; dstq / tmpq
%else
    mov                  hm, r5
    mov                 r0m, r0
    mov                  r3, r3m
%endif
    paddd               m14, m7
.dy1_hloop:
%if ARCH_X86_64
    mova                 m9, [base+pq_0x40000000]
%else
 %define m9 [base+pq_0x40000000]
%endif
    pxor                 m1, m1
    psrld                m2, m14, 10
    mova              [stk], m2
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd                m5, m15, m6
    pcmpeqd              m6, m1
    pshufd               m2, m5, q1032
%if ARCH_X86_64
    movd                r4d, m5
    movd                r6d, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                r7d, m5
    movd                r9d, m2
    movq                 m0, [base+subpel_filters+r4*8]
    movq                 m1, [base+subpel_filters+r6*8]
    movhps               m0, [base+subpel_filters+r7*8]
    movhps               m1, [base+subpel_filters+r9*8]
%else
    movd                 r0, m5
    movd                 rX, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                 r4, m5
    movd                 r5, m2
    movq                 m0, [base+subpel_filters+r0*8]
    movq                 m1, [base+subpel_filters+rX*8]
    movhps               m0, [base+subpel_filters+r4*8]
    movhps               m1, [base+subpel_filters+r5*8]
%endif
    paddd               m14, m7 ; mx+dx*[4-7]
    pand                 m5, m14, m10
    psrld                m5, 6
    paddd               m15, m5
    pxor                 m2, m2
    pcmpeqd              m5, m2
    mova        [stk+0x110], m14
    pshufd               m4, m15, q1032
%if ARCH_X86_64
    movd               r10d, m15
    movd               r11d, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    movq                r11, m14
    punpckhqdq          m14, m14
    movq                 rX, m14
    mov                r10d, r11d
    shr                 r11, 32
    mov                r13d, rXd
    shr                  rX, 32
    mov                 r4d, [stk+ 0]
    mov                 r6d, [stk+ 4]
    mov                 r7d, [stk+ 8]
    mov                 r9d, [stk+12]
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd              m14, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m7, m9, m4
    pand                 m8, m9, m6
    pand                m15, m9, m14
    pand                 m9, m9, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn               m14, m2
    pandn                m5, m3
    por                  m7, m4
    por                  m8, m6
    por                 m15, m14
    por                  m9, m5
    punpcklbw            m0, m7, m7
    punpckhbw            m7, m7
    punpcklbw            m1, m8, m8
    punpckhbw            m8, m8
    psraw                m0, 8
    psraw                m7, 8
    psraw                m1, 8
    psraw                m8, 8
    punpcklbw            m2, m15, m15
    punpckhbw           m15, m15
    punpcklbw            m3, m9, m9
    punpckhbw            m9, m9
    psraw                m2, 8
    psraw               m15, 8
    psraw                m3, 8
    psraw                m9, 8
    mova         [stk+0x10], m0
    mova         [stk+0x20], m7
    mova         [stk+0x30], m1
    mova         [stk+0x40], m8
    mova         [stk+0x50], m2
    mova         [stk+0x60], m15
    mova         [stk+0x70], m3
    mova         [stk+0x80], m9
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova         [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova         [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova         [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova         [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova         [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    mova                 m5, [stk+0xd0]
    mova                 m1, [stk+0x90]
    mova                 m2, [stk+0xa0]
    mova                 m3, [stk+0xb0]
    mova                 m9, [stk+0xc0]
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m9 ; 23a
    punpckhwd            m3, m9     ; 23b
    mova                m10, [stk+0x140]
    mova                m11, [stk+0x150]
    mova                m14, [stk+0x160]
    mova                m15, [stk+0x170]
    mova         [stk+0x90], m4
    mova         [stk+0xa0], m5
    mova         [stk+0xb0], m6
    mova         [stk+0xc0], m7
 %define hround [rsp+0x10]
 %define shift  [rsp+0x20]
 %if isput
  %define vround [rsp+0x30]
 %else
  %define vround [base+pd_m524256]
 %endif
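; dy1 vertical loop (x86_64): one output row per pass; the 01/23 pairs
; stay in m0-m3 while the 45/67 pairs are read back from the stack.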
.dy1_vloop:
    pmaddwd              m4, m0, m10
    pmaddwd              m5, m1, m10
    pmaddwd              m6, m2, m11
    pmaddwd              m7, m3, m11
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
    pmaddwd              m6, [stk+0x90], m14
    pmaddwd              m7, [stk+0xa0], m14
    pmaddwd              m8, [stk+0xb0], m15
    pmaddwd              m9, [stk+0xc0], m15
    paddd                m4, m6
    paddd                m5, m7
 %if isput
    pshufd               m6, m12, q1032
 %endif
    paddd                m4, m8
    paddd                m5, m9
%else
    movd                 r0, m15
    movd                 rX, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [stk+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10
    mova           [stk+16], m14
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m9, m4
    pand                 m1, m9, m6
    pand                 m2, m9, m7
    pand                 m3, m9, m5
    pandn                m4, [stk+0x20]
    pandn                m6, [stk+0x30]
    pandn                m7, [stk+0x40]
    pandn                m5, [stk+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    punpcklbw            m4, m0, m0
    punpckhbw            m0, m0
    punpcklbw            m5, m1, m1
    punpckhbw            m1, m1
    psraw                m4, 8
    psraw                m0, 8
    psraw                m5, 8
    psraw                m1, 8
    punpcklbw            m6, m2, m2
    punpckhbw            m2, m2
    punpcklbw            m7, m3, m3
    punpckhbw            m3, m3
    psraw                m6, 8
    psraw                m2, 8
    psraw                m7, 8
    psraw                m3, 8
    mova        [stk+0x0a0], m4
    mova        [stk+0x0b0], m0
    mova        [stk+0x0c0], m5
    mova        [stk+0x0d0], m1
    mova        [stk+0x140], m6
    mova        [stk+0x150], m2
    mova        [stk+0x160], m7
    mova        [stk+0x170], m3
    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
    mova                 m5, [stk+0x60]
    mova                 m6, [stk+0x70]
    mova                 m7, [stk+0x80]
    mova                 m0, [stk+0x90]
    mov                  r0, r0m
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova         [stk+0x60], m4
    mova         [stk+0x70], m5
    mova         [stk+0x80], m6
    mova         [stk+0x90], m7
    mova                 m1, [stk+0x20]
    mova                 m2, [stk+0x30]
    mova                 m3, [stk+0x40]
    mova                 m4, [stk+0x50]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova                 m4, [stk+0x180]
    mova                 m5, [stk+0x190]
    mova                 m6, [stk+0x1a0]
    mova                 m7, [stk+0x1b0]
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
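; x86_32 variant of the vertical loop: only the 01/23 pairs fit in
; registers, so the 45/67 pairs and coefficients spill to the stack.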
.dy1_vloop:
    pmaddwd              m0, m4
    pmaddwd              m1, m4
    pmaddwd              m2, m5
    pmaddwd              m3, m5
    paddd                m0, m2
    paddd                m1, m3
    pmaddwd              m2, [stk+0x60], m6
    pmaddwd              m3, [stk+0x70], m6
    pmaddwd              m4, [stk+0x80], m7
    pmaddwd              m5, [stk+0x90], m7
 %if isput
    movd                 m6, [esp+0x18]
 %endif
    paddd                m0, m2
    paddd                m1, m3
    paddd                m0, vrnd_mem
    paddd                m1, vrnd_mem
    paddd                m4, m0
    paddd                m5, m1
%endif
%ifidn %1, put
    psrad                m4, m6
    psrad                m5, m6
    packssdw             m4, m5
    pxor                 m7, m7
    pmaxsw               m4, m7
    pminsw               m4, pxmaxm
    mova             [dstq], m4
    add                dstq, dsm
%else
    psrad                m4, 6
    psrad                m5, 6
    packssdw             m4, m5
    mova             [tmpq], m4
    add                tmpq, tmp_stridem
%endif
    dec                  hd
    jz .dy1_hloop_prep
%if ARCH_X86_64
    movu                 m8, [srcq+r10*2]
    movu                 m9, [srcq+r11*2]
    movu                m12, [srcq+r13*2]
    movu                m13, [srcq+ rX*2]
    movu                 m4, [srcq+ r4*2]
    movu                 m5, [srcq+ r6*2]
    movu                 m6, [srcq+ r7*2]
    movu                 m7, [srcq+ r9*2]
    add                srcq, ssq
    pmaddwd              m8, [stk+0x50]
    pmaddwd              m9, [stk+0x60]
    pmaddwd             m12, [stk+0x70]
    pmaddwd             m13, [stk+0x80]
    pmaddwd              m4, [stk+0x10]
    pmaddwd              m5, [stk+0x20]
    pmaddwd              m6, [stk+0x30]
    pmaddwd              m7, [stk+0x40]
    phaddd               m8, m9
    phaddd              m12, m13
    mova                 m9, [base+unpckw]
    mova                m13, hround
    phaddd               m4, m5
    phaddd               m6, m7
    phaddd               m8, m12
    phaddd               m4, m6
    pshufd               m5, m9, q1032
    pshufb               m0, m9             ; 0a 1a
    pshufb               m1, m9             ; 0b 1b
    pshufb               m2, m5             ; 3a 2a
    pshufb               m3, m5             ; 3b 2b
    mova                m12, shift
    paddd                m4, m13
    paddd                m8, m13
    psrad                m4, m12
    psrad                m8, m12
    packssdw             m4, m8
    pshufb               m6, [stk+0x90], m9 ; 4a 5a
    pshufb               m7, [stk+0xa0], m9 ; 4b 5b
    pshufb               m8, [stk+0xb0], m5 ; 7a 6a
    pshufb              m13, [stk+0xc0], m5 ; 7b 6b
    punpckhwd            m0, m2  ; 12a
    punpckhwd            m1, m3  ; 12b
    punpcklwd            m2, m6  ; 34a
    punpcklwd            m3, m7  ; 34b
    punpckhwd            m6, m8  ; 56a
    punpckhwd            m7, m13 ; 56b
    punpcklwd            m8, m4  ; 78a
    punpckhqdq           m4, m4
    punpcklwd           m13, m4  ; 78b
    mova         [stk+0x90], m6
    mova         [stk+0xa0], m7
    mova         [stk+0xb0], m8
    mova         [stk+0xc0], m13
    mova                m13, vround
%else
    mov                 r0m, r0
    mov                  r3, r3m
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    MC_8TAP_SCALED_H 0xa0, 0xe0, 0 ; 8
    mova                 m7, [base+unpckw]
    pshufd               m4, m7, q1032
    pshufb               m0, [stk+0x20], m7 ; 0a 1a
    pshufb               m1, [stk+0x30], m7 ; 0b 1b
    pshufb               m2, [stk+0x40], m4 ; 3a 2a
    pshufb               m3, [stk+0x50], m4 ; 3b 2b
    pshufb               m5, [stk+0x60], m7 ; 4a 5a
    pshufb               m6, [stk+0x70], m7 ; 4b 5b
    pshufb               m7, [stk+0x80], m4 ; 7a 6a
    punpckhwd            m0, m2 ; 12a
    punpckhwd            m1, m3 ; 12b
    punpcklwd            m2, m5 ; 34a
    punpcklwd            m3, m6 ; 34b
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    punpckhwd            m5, m7 ; 56a
    mova         [stk+0x60], m5
    pshufb               m5, [stk+0x90], m4 ; 7b 6b
    punpcklwd            m7, [stk+0xe0] ; 78a
    mova                 m4, [stk+0x180]
    punpckhwd            m6, m5 ; 56b
    mova         [stk+0x70], m6
    movq                 m6, [stk+0xe8]
    mova         [stk+0x80], m7
    mova                 m7, [stk+0x1b0]
    punpcklwd            m5, m6
    mova                 m6, [stk+0x1a0]
    mova         [stk+0x90], m5
    mova                 m5, [stk+0x190]
    mov                  r0, r0m
%endif
    jmp .dy1_vloop
INIT_XMM ssse3
%if ARCH_X86_64
 %define stk rsp+0x20
%endif
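; dy2: vertical scaling with a step of two source rows per output row,
; so each new output row drops two rows from the filter window.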
.dy2:
    movzx                wd, word [base+%1_8tap_scaled_ssse3_dy2_table+wq*2]
    add                  wq, base_reg
    jmp                  wq
%if isput
.dy2_w2:
 %if ARCH_X86_64
    mov                 myd, mym
    mova         [rsp+0x10], m13
  %define vrnd_mem [rsp+0x10]
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
 %else
  %define m8  m0
  %define m9  m1
  %define m14 m4
  %define m15 m3
  %define m11 [esp+0x00]
  %define m12 [esp+0x10]
  %define vrnd_mem [esp+0x20]
    mov                  r1, r1m
    movzx                r5, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r5
 %endif
    pxor                 m9, m9
    punpckldq            m9, m8
    paddd               m14, m9 ; mx+dx*[0-1]
 %if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
 %endif
    pshufd              m15, m15, q0000
    pand                 m8, m14, m10
    psrld                m8, 6
    paddd               m15, m8
    movd                r4d, m15
    pshufd              m15, m15, q0321
 %if ARCH_X86_64
    movd                r6d, m15
 %else
    movd                r3d, m15
 %endif
    mova                 m5, [base+bdct_lb_q]
    mova                 m6, [base+spel_s_shuf2]
    movd                m15, [base+subpel_filters+r4*8+2]
 %if ARCH_X86_64
    movd                 m7, [base+subpel_filters+r6*8+2]
 %else
    movd                 m7, [base+subpel_filters+r3*8+2]
 %endif
    pxor                 m2, m2
    pcmpeqd              m8, m2
    psrld               m14, 10
    paddd               m14, m14
 %if ARCH_X86_32
    mov                  r3, r3m
    pshufb              m14, m5
    paddb               m14, m6
    mova              [stk], m14
    SWAP                 m5, m0
    SWAP                 m6, m3
  %define m15 m6
 %endif
    movu                 m0, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*2]
    movu                 m2, [srcq+ssq*4]
    punpckldq           m15, m7
 %if ARCH_X86_64
    pshufb              m14, m5
    paddb               m14, m6
    pand                 m9, m8
    pandn                m8, m15
    SWAP                m15, m8
    por                 m15, m9
    movu                 m4, [srcq+ssq*1]
    movu                 m5, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
 %else
    pand                 m7, m5, [base+pd_0x4000]
    pandn                m5, m15
    por                  m5, m7
  %define m15 m5
    mov                 myd, mym
    mov                  r5, [esp+0x1f4]
    xor                  r3, r3
    shr                 myd, 6
    lea                  r5, [r5+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r5*8+0]
    cmovnz               r3, [base+subpel_filters+r5*8+4]
    mov          [stk+0x20], r3
    mov                  r3, r3m
 %endif
    punpcklbw           m15, m15
    psraw               m15, 8
    REPX    {pshufb x, m14}, m0, m1, m2
    REPX   {pmaddwd x, m15}, m0, m1, m2
 %if ARCH_X86_64
    REPX    {pshufb x, m14}, m4, m5, m6
    REPX   {pmaddwd x, m15}, m4, m5, m6
    phaddd               m0, m1
    phaddd               m1, m2
    phaddd               m4, m5
    phaddd               m5, m6
    REPX     {paddd x, m11}, m0, m1, m4, m5
    REPX     {psrad x, m12}, m0, m1, m4, m5
    packssdw             m0, m1 ; 0 2 2 4
    packssdw             m4, m5 ; 1 3 3 5
    SWAP                 m2, m4
    movq                m10, r4
 %else
    mova         [stk+0x10], m15
    phaddd               m0, m1
    phaddd               m1, m2
    movu                 m2, [srcq+ssq*1]
    movu                 m7, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    movu                 m6, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    REPX    {pshufb x, m14}, m2, m7, m6
    REPX   {pmaddwd x, m15}, m2, m7, m6
  %define m14 [stk+0x00]
  %define m15 [stk+0x10]
    phaddd               m2, m7
    phaddd               m7, m6
    REPX     {paddd x, m11}, m0, m1, m2, m7
    REPX     {psrad x, m12}, m0, m1, m2, m7
    packssdw             m0, m1
    packssdw             m2, m7
  %define m8  m6
  %define m9  m4
  %define m10 m5
    movd                m10, r4
    movd                 m9, [stk+0x20]
    punpckldq           m10, m9
 %endif
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m7, m10, q0000
    pshufd               m8, m10, q1111
    pshufd               m9, m10, q2222
    pshufd              m10, m10, q3333
 %if ARCH_X86_32
    mova         [stk+0x50], m7
    mova         [stk+0x60], m8
    mova         [stk+0x70], m9
    mova         [stk+0x80], m10
  %xdefine m13 m7
  %define m7  [stk+0x50]
  %define m8  [stk+0x60]
  %define m9  [stk+0x70]
  %define m10 [stk+0x80]
 %endif
    punpcklwd            m1, m0, m2    ; 01 23
    punpckhwd            m3, m0, m2    ; 23 45
 %if ARCH_X86_32
    mov                  r4, r0m
  %define dstq r4
    mova         [stk+0x20], m3
    mova         [stk+0x30], m0
 %endif
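; dy2 w2 inner loop: four new rows (6-9) are filtered horizontally per
; pass, giving two output rows built from alternating source rows.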
.dy2_w2_loop:
    movu                 m4, [srcq+ssq*0]
    movu                 m5, [srcq+ssq*1]
    movu                 m6, [srcq+ssq*2]
    movu                m13, [srcq+ss3q ]
    lea                srcq, [srcq+ssq*4]
    pmaddwd              m3, m8
    REPX    {pshufb x, m14}, m4, m5, m6, m13
    REPX   {pmaddwd x, m15}, m4, m5, m6, m13
    phaddd               m4, m5
    phaddd               m6, m13
    pmaddwd              m5, m1, m7
    paddd                m4, m11
    paddd                m6, m11
    psrad                m4, m12
    psrad                m6, m12
    packssdw             m4, m6 ; 6 7 8 9
    paddd                m5, m3
    pshufd               m3, m4, q2200
    pshufd               m4, m4, q3311
    palignr              m3, m0, 12 ; 4 6 6 8
    palignr              m4, m2, 12 ; 5 7 7 9
    mova                 m0, m3
    mova                 m2, m4
    punpcklwd            m1, m3, m4
    punpckhwd            m3, m4
    pmaddwd              m6, m1, m9
    pmaddwd              m4, m3, m10
    paddd                m5, vrnd_mem
    paddd                m6, m4
    paddd                m5, m6
    pshufd               m4, m12, q1032
    pxor                 m6, m6
    psrad                m5, m4
    packssdw             m5, m5
    pmaxsw               m5, m6
    pminsw               m5, pxmaxm
    movd       [dstq+dsq*0], m5
    pshuflw              m5, m5, q1032
    movd       [dstq+dsq*1], m5
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .dy2_w2_loop
    RET
%endif
INIT_XMM ssse3
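; dy2 w4: per-lane 4-tap horizontal setup as in the dy1 w4 path; the
; vertical stage below then consumes two source rows per output row.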
.dy2_w4:
%if ARCH_X86_64
    mov                 myd, mym
    mova         [rsp+0x10], m11
    mova         [rsp+0x20], m12
 %if isput
    mova         [rsp+0x30], m13
  %define vrnd_mem [rsp+0x30]
  %define stk rsp+0x40
 %else
  %define vrnd_mem [base+pd_m524256]
  %define stk rsp+0x30
 %endif
    movzx               t0d, t0b
    sub                srcq, 2
    movd                m15, t0d
%else
 %define m10 [base+pd_0x3ff]
 %define m9  [base+pd_0x4000]
 %define m8  m0
 %xdefine m14 m4
 %define m15 m3
 %if isprep
  %define ssq r3
 %endif
    movzx                r5, byte [esp+0x1f0]
    sub                srcq, 2
    movd                m15, r5
%endif
    pmaddwd              m8, [base+rescale_mul]
%if ARCH_X86_64
    mova                 m9, [base+pd_0x4000]
%endif
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
    pand                 m0, m14, m10
    psrld                m0, 6
    paddd               m15, m0
    pshufd               m7, m15, q1032
%if ARCH_X86_64
    movd                r4d, m15
    movd               r11d, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                r6d, m15
    movd               r13d, m7
    mova                m10, [base+bdct_lb_q+ 0]
    mova                m11, [base+bdct_lb_q+16]
    movd                m13, [base+subpel_filters+ r4*8+2]
    movd                 m2, [base+subpel_filters+ r6*8+2]
    movd                m15, [base+subpel_filters+r11*8+2]
    movd                 m4, [base+subpel_filters+r13*8+2]
%else
    movd                 r1, m15
    movd                 r4, m7
    pshufd              m15, m15, q0321
    pshufd               m7, m7, q0321
    movd                 r3, m15
    movd                 r5, m7
    mova                 m5, [base+bdct_lb_q+ 0]
    mova                 m6, [base+bdct_lb_q+16]
    movd                 m1, [base+subpel_filters+r1*8+2]
    movd                 m2, [base+subpel_filters+r3*8+2]
    movd                 m3, [base+subpel_filters+r4*8+2]
    movd                 m7, [base+subpel_filters+r5*8+2]
    SWAP                 m4, m7
    mov                  r3, r3m
 %if isprep
    lea                ss3q, [ssq*3]
 %endif
 %define m10 m5
 %define m11 m6
 %define m12 m1
 %define m13 m1
%endif
    psrld               m14, 10
    paddd               m14, m14
    punpckldq           m13, m2
    punpckldq           m15, m4
    punpcklqdq          m13, m15
    pxor                 m2, m2
    pcmpeqd              m0, m2
%if ARCH_X86_64
    pand                 m9, m0
%else
    pand                 m2, m9, m0
 %define m9 m2
    SWAP                 m7, m4
%endif
    pandn                m0, m13
%if ARCH_X86_64
    SWAP                m13, m0
%else
 %define m13 m0
%endif
    por                 m13, m9
    punpckhbw           m15, m13, m13
    punpcklbw           m13, m13
    psraw               m15, 8
    psraw               m13, 8
    pshufb              m12, m14, m10
    pshufb              m14, m11
    mova                m10, [base+spel_s_shuf2]
    movd                r4d, m14
    shr                 r4d, 24
%if ARCH_X86_32
    mova         [stk+0x40], m13
    mova         [stk+0x50], m15
    pxor                 m2, m2
%endif
    pshufb               m7, m14, m2
    psubb               m14, m7
    paddb               m12, m10
    paddb               m14, m10
%if ARCH_X86_64
    lea                  r6, [r4+ssq*1]
    lea                 r11, [r4+ssq*2]
    lea                 r13, [r4+ss3q ]
    movu                 m1, [srcq+ssq*0]
    movu                 m8, [srcq+ssq*2]
    movu                 m9, [srcq+ssq*1]
    movu                m10, [srcq+ss3q ]
    movu                 m7, [srcq+r4   ]
    movu                 m2, [srcq+r11  ]
    movu                 m3, [srcq+r6   ]
    movu                 m4, [srcq+r13  ]
    lea                srcq, [srcq+ssq*4]
    REPX    {pshufb x, m12}, m1, m9, m8, m10
    REPX   {pmaddwd x, m13}, m1, m9, m8, m10
    REPX    {pshufb x, m14}, m7, m3, m2, m4
    REPX   {pmaddwd x, m15}, m7, m3, m2, m4
    mova                 m5, [rsp+0x10]
    movd                xm6, [rsp+0x20]
    phaddd               m1, m7
    phaddd               m8, m2
    phaddd               m9, m3
    phaddd              m10, m4
    movu                 m2, [srcq+ssq*0]
    movu                 m3, [srcq+ssq*1]
    REPX      {paddd x, m5}, m1, m9, m8, m10
    REPX     {psrad x, xm6}, m1, m9, m8, m10
    packssdw             m1, m8     ; 0 2
    packssdw             m9, m10    ; 1 3
    movu                 m0, [srcq+r4   ]
    movu                 m8, [srcq+r6   ]
    lea                srcq, [srcq+ssq*2]
    REPX    {pshufb x, m12}, m2, m3
    REPX   {pmaddwd x, m13}, m2, m3
    REPX    {pshufb x, m14}, m0, m8
    REPX   {pmaddwd x, m15}, m0, m8
    phaddd               m2, m0
    phaddd               m3, m8
    shr                 myd, 6
    mov                 r9d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r9q, [base+subpel_filters+myq*8]
    REPX      {paddd x, m5}, m2, m3
    REPX     {psrad x, xm6}, m2, m3
    packssdw             m2, m3        ; 4 5
    pshufd               m3, m2, q1032 ; 5 _
    punpcklwd            m0, m1, m9    ; 01
    punpckhwd            m1, m9        ; 23
    punpcklwd            m2, m3        ; 45
    movq                m10, r9
 %define hrnd_mem [rsp+0x10]
 %define hsh_mem  [rsp+0x20]
 %define vsh_mem  [rsp+0x28]
 %if isput
  %define vrnd_mem [rsp+0x30]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
%else
    mova         [stk+0x20], m12
    mova         [stk+0x30], m14
    add                  r4, srcq
    MC_4TAP_SCALED_H   0x60 ; 0 1
    MC_4TAP_SCALED_H   0x70 ; 2 3
    MC_4TAP_SCALED_H   0x80 ; 4 5
    mov          [stk+0xe0], r4
    mova                 m3, [base+spel_s_shuf8]
    mova                 m0, [stk+0x60]
    mova                 m1, [stk+0x70]
    mova                 m2, [stk+0x80]
    mov                 myd, mym
    mov                  rX, [esp+0x1f4]
    xor                  r5, r5
    shr                 myd, 6
    lea                  rX, [rX+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+rX*8+0]
    cmovnz               r5, [base+subpel_filters+rX*8+4]
    mov                  r3, r3m
    pshufb               m0, m3 ; 01
    pshufb               m1, m3 ; 23
    pshufb               m2, m3 ; 45
    movd                 m7, r4
    movd                 m4, r5
    mov                  r5, r0m
 %if isput
    mov                  r1, r1m
 %endif
    mov                  r4, [stk+0xe0]
 %define dstq r5
 %define tmpq r5
 %define m12 [stk+0x20]
 %define m14 [stk+0x30]
 %define m13 [stk+0x40]
 %define m15 [stk+0x50]
 %define hrnd_mem [esp+0x00]
 %define hsh_mem  [esp+0x10]
 %define vsh_mem  [esp+0x18]
 %if isput
  %define vrnd_mem [esp+0x20]
 %else
  %define vrnd_mem [base+pd_m524256]
 %endif
 %define m10 m7
    punpckldq           m10, m4
%endif
    punpcklbw           m10, m10
    psraw               m10, 8
    pshufd               m3, m10, q0000
    pshufd               m4, m10, q1111
    pshufd               m5, m10, q2222
    pshufd              m10, m10, q3333
%if ARCH_X86_32
 %xdefine m8  m3
 %xdefine m9  m6
 %xdefine m11 m5
 %xdefine m6  m4
    mova         [stk+0x100], m3
    mova         [stk+0x110], m4
    mova         [stk+0x120], m5
    mova         [stk+0x130], m10
 %define m3  [stk+0x100]
 %define m4  [stk+0x110]
 %define m5  [stk+0x120]
 %define m10 [stk+0x130]
%endif
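; dy2 w4 inner loop: rows 6/8 and 7/9 are filtered and packed, then the
; 67/89 word pairs feed the last vertical taps for two output rows.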
.dy2_w4_loop:
    pmaddwd              m8, m0, m3
    pmaddwd              m9, m1, m3
    mova                 m0, m2
    pmaddwd              m1, m4
    pmaddwd             m11, m2, m4
    paddd                m8, vrnd_mem
    paddd                m9, vrnd_mem
    pmaddwd              m2, m5
    paddd                m8, m1
    paddd                m9, m11
    paddd                m8, m2
    movu                 m6, [srcq+ssq*0]
    movu                 m1, [srcq+ssq*2]
%if ARCH_X86_64
    movu                m11, [srcq+r4 ]
    movu                 m2, [srcq+r11]
%else
    movu                m11, [r4+ssq*0]
    movu                 m2, [r4+ssq*2]
%endif
    pshufb               m6, m12
    pshufb               m1, m12
    pmaddwd              m6, m13
    pmaddwd              m1, m13
    pshufb              m11, m14
    pshufb               m2, m14
    pmaddwd             m11, m15
    pmaddwd              m2, m15
    phaddd               m6, m11
    phaddd               m1, m2
    paddd                m6, hrnd_mem
    paddd                m1, hrnd_mem
    psrad                m6, hsh_mem
    psrad                m1, hsh_mem
    movu                 m7, [srcq+ssq*1]
    movu                m11, [srcq+ss3q ]
    packssdw             m6, m1 ; 6 8
%if ARCH_X86_64
    movu                 m2, [srcq+r6 ]
    movu                 m1, [srcq+r13]
%else
    movu                 m2, [r4+ssq*1]
    movu                 m1, [r4+ss3q ]
%endif
    pshufb               m7, m12
    pshufb              m11, m12
    pmaddwd              m7, m13
    pmaddwd             m11, m13
    pshufb               m2, m14
    pshufb               m1, m14
    pmaddwd              m2, m15
    pmaddwd              m1, m15
    phaddd               m7, m2
    phaddd              m11, m1
    paddd                m7, hrnd_mem
    paddd               m11, hrnd_mem
    psrad                m7, hsh_mem
    psrad               m11, hsh_mem
    packssdw             m7, m11 ; 7 9
%if ARCH_X86_32
    lea                  r4, [r4+ssq*4]
%endif
    lea                srcq, [srcq+ssq*4]
    punpcklwd            m1, m6, m7 ; 67
    punpckhwd            m6, m7     ; 89
    mova                 m2, m6
    pmaddwd             m11, m1, m5
    pmaddwd              m7, m1, m10
    pmaddwd              m6, m10
    paddd                m9, m11
%if isput
    movd                m11, vsh_mem
%endif
    paddd                m8, m7
    paddd                m9, m6
%if isput
    psrad                m8, m11
    psrad                m9, m11
    packssdw             m8, m9
    pxor                 m7, m7
    pmaxsw               m8, m7
    pminsw               m8, pxmaxm
    movq       [dstq+dsq*0], m8
    movhps     [dstq+dsq*1], m8
    lea                dstq, [dstq+dsq*2]
%else
    psrad                m8, 6
    psrad                m9, 6
    packssdw             m8, m9
    mova             [tmpq], m8
    add                tmpq, 16
%endif
    sub                  hd, 2
    jg .dy2_w4_loop
    MC_8TAP_SCALED_RET ; why not jz .ret?
INIT_XMM ssse3
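; Width dispatch for the wide dy2 blocks, laid out like the dy1 case:
; [stk+0xf0] holds the 8-column tile count, tmp_stridem the prep stride.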
.dy2_w8:
    mov    dword [stk+0xf0], 1
    movifprep   tmp_stridem, 16
    jmp .dy2_w_start
.dy2_w16:
    mov    dword [stk+0xf0], 2
    movifprep   tmp_stridem, 32
    jmp .dy2_w_start
.dy2_w32:
    mov    dword [stk+0xf0], 4
    movifprep   tmp_stridem, 64
    jmp .dy2_w_start
.dy2_w64:
    mov    dword [stk+0xf0], 8
    movifprep   tmp_stridem, 128
    jmp .dy2_w_start
.dy2_w128:
    mov    dword [stk+0xf0], 16
    movifprep   tmp_stridem, 256
.dy2_w_start:
    mov                 myd, mym
%if ARCH_X86_64
 %ifidn %1, put
    movifnidn           dsm, dsq
 %endif
    mova         [rsp+0x10], m11
    mova         [rsp+0x20], m12
 %define hround m11
 %if isput
    mova         [rsp+0x30], m13
 %else
    mova                m13, [base+pd_m524256]
 %endif
    shr                 t0d, 16
    shr                 myd, 6
    mov                 r4d, 64 << 24
    lea                 myd, [t1+myq]
    cmovnz              r4q, [base+subpel_filters+myq*8]
    movd                m15, t0d
%else
 %define hround [esp+0x00]
 %define m12    [esp+0x10]
 %define m10    [base+pd_0x3ff]
 %define m8  m0
 %xdefine m14 m4
 %xdefine m15 m3
 %if isput
  %define dstq r0
 %else
  %define tmpq r0
  %define ssq ssm
 %endif
    mov                  r5, [esp+0x1f0]
    mov                  r3, [esp+0x1f4]
    shr                  r5, 16
    movd                m15, r5
    xor                  r5, r5
    shr                 myd, 6
    lea                  r3, [r3+myd]
    mov                  r4, 64 << 24
    cmovnz               r4, [base+subpel_filters+r3*8+0]
    cmovnz               r5, [base+subpel_filters+r3*8+4]
    mov                  r0, r0m
    mov                  r3, r3m
%endif
    sub                srcq, 6
    pslld                m7, m8, 2 ; dx*4
    pmaddwd              m8, [base+rescale_mul] ; dx*[0-3]
    pshufd              m15, m15, q0000
    paddd               m14, m8 ; mx+dx*[0-3]
%if ARCH_X86_64
    movq                 m3, r4q
%else
    movd                 m5, r4
    movd                 m6, r5
    punpckldq            m5, m6
    SWAP                 m3, m5
%endif
    punpcklbw            m3, m3
    psraw                m3, 8
    mova        [stk+0x100], m7
    mova        [stk+0x120], m15
    mov         [stk+0x0f8], srcq
    mov         [stk+0x130], r0q ; dstq / tmpq
    pshufd               m0, m3, q0000
    pshufd               m1, m3, q1111
    pshufd               m2, m3, q2222
    pshufd               m3, m3, q3333
%if ARCH_X86_64
    mova        [stk+0x140], m0
    mova        [stk+0x150], m1
    mova        [stk+0x160], m2
    mova        [stk+0x170], m3
 %if UNIX64
    mov                  hm, hd
 %endif
%else
    mova        [stk+0x180], m0
    mova        [stk+0x190], m1
    mova        [stk+0x1a0], m2
    mova        [stk+0x1b0], m3
    SWAP                 m5, m3
    mov                  r5, hm
    mov         [stk+0x134], r5
%endif
    jmp .dy2_hloop
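; Per-tile re-entry for dy2, mirroring .dy1_hloop_prep: restore srcq and
; the horizontal state, then advance mx by dx*4 for the next tile.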
.dy2_hloop_prep:
    dec   dword [stk+0x0f0]
    jz .ret
%if ARCH_X86_64
    add   qword [stk+0x130], 16
    mov                  hd, hm
%else
    add   dword [stk+0x130], 16
    mov                  r5, [stk+0x134]
    mov                  r0, [stk+0x130]
%endif
    mova                 m7, [stk+0x100]
    mova                m14, [stk+0x110]
%if ARCH_X86_64
    mova                m10, [base+pd_0x3ff]
    mova                m11, [rsp+0x10]
%endif
    mova                m15, [stk+0x120]
    mov                srcq, [stk+0x0f8]
%if ARCH_X86_64
    mov                 r0q, [stk+0x130] ; dstq / tmpq
%else
    mov                  hm, r5
    mov                 r0m, r0
    mov                  r3, r3m
%endif
    paddd               m14, m7
.dy2_hloop:
%if ARCH_X86_64
    mova                 m9, [base+pq_0x40000000]
%else
 %define m9 [base+pq_0x40000000]
%endif
    pxor                 m1, m1
    psrld                m2, m14, 10
    mova              [stk], m2
    pand                 m6, m14, m10
    psrld                m6, 6
    paddd                m5, m15, m6
    pcmpeqd              m6, m1
    pshufd               m2, m5, q1032
%if ARCH_X86_64
    movd                r4d, m5
    movd                r6d, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                r7d, m5
    movd                r9d, m2
    movq                 m0, [base+subpel_filters+r4*8]
    movq                 m1, [base+subpel_filters+r6*8]
    movhps               m0, [base+subpel_filters+r7*8]
    movhps               m1, [base+subpel_filters+r9*8]
%else
    movd                 r0, m5
    movd                 rX, m2
    pshufd               m5, m5, q0321
    pshufd               m2, m2, q0321
    movd                 r4, m5
    movd                 r5, m2
    movq                 m0, [base+subpel_filters+r0*8]
    movq                 m1, [base+subpel_filters+rX*8]
    movhps               m0, [base+subpel_filters+r4*8]
    movhps               m1, [base+subpel_filters+r5*8]
%endif
    paddd               m14, m7 ; mx+dx*[4-7]
    pand                 m5, m14, m10
    psrld                m5, 6
    paddd               m15, m5
    pxor                 m2, m2
    pcmpeqd              m5, m2
    mova        [stk+0x110], m14
    pshufd               m4, m15, q1032
%if ARCH_X86_64
    movd               r10d, m15
    movd               r11d, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd               r13d, m15
    movd                rXd, m4
    movq                 m2, [base+subpel_filters+r10*8]
    movq                 m3, [base+subpel_filters+r11*8]
    movhps               m2, [base+subpel_filters+r13*8]
    movhps               m3, [base+subpel_filters+ rX*8]
    psrld               m14, 10
    movq                r11, m14
    punpckhqdq          m14, m14
    movq                 rX, m14
    mov                r10d, r11d
    shr                 r11, 32
    mov                r13d, rXd
    shr                  rX, 32
    mov                 r4d, [stk+ 0]
    mov                 r6d, [stk+ 4]
    mov                 r7d, [stk+ 8]
    mov                 r9d, [stk+12]
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd              m14, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m7, m9, m4
    pand                 m8, m9, m6
    pand                m15, m9, m14
    pand                 m9, m9, m5
    pandn                m4, m0
    pandn                m6, m1
    pandn               m14, m2
    pandn                m5, m3
    por                  m7, m4
    por                  m8, m6
    por                 m15, m14
    por                  m9, m5
    punpcklbw            m0, m7, m7
    punpckhbw            m7, m7
    punpcklbw            m1, m8, m8
    punpckhbw            m8, m8
    psraw                m0, 8
    psraw                m7, 8
    psraw                m1, 8
    psraw                m8, 8
    punpcklbw            m2, m15, m15
    punpckhbw           m15, m15
    punpcklbw            m3, m9, m9
    punpckhbw            m9, m9
    psraw                m2, 8
    psraw               m15, 8
    psraw                m3, 8
    psraw                m9, 8
    mova         [stk+0x10], m0
    mova         [stk+0x20], m7
    mova         [stk+0x30], m1
    mova         [stk+0x40], m8
    mova         [stk+0x50], m2
    mova         [stk+0x60], m15
    mova         [stk+0x70], m3
    mova         [stk+0x80], m9
    MC_8TAP_SCALED_H 1, 2, 3, 4, 5, 6, 9, 10 ; 0
    mova         [stk+0x90], m1
    MC_8TAP_SCALED_H 2, 3, 4, 5, 6, 1, 9, 10 ; 1
    mova         [stk+0xa0], m2
    MC_8TAP_SCALED_H 3, 4, 5, 6, 1, 2, 9, 10 ; 2
    mova         [stk+0xb0], m3
    MC_8TAP_SCALED_H 4, 5, 6, 1, 2, 3, 9, 10 ; 3
    mova         [stk+0xc0], m4
    MC_8TAP_SCALED_H 5, 6, 1, 2, 3, 4, 9, 10 ; 4
    mova         [stk+0xd0], m5
    MC_8TAP_SCALED_H 6, 1, 2, 3, 4, 5, 9, 10 ; 5
    MC_8TAP_SCALED_H 7, 1, 2, 3, 4, 5, 9, 10 ; 6
    MC_8TAP_SCALED_H 8, 1, 2, 3, 4, 5, 9, 10 ; 7
    mova                 m5, [stk+0xd0]
    mova                 m1, [stk+0x90]
    mova                 m2, [stk+0xa0]
    mova                 m3, [stk+0xb0]
    mova                 m9, [stk+0xc0]
    punpcklwd            m4, m5, m6 ; 45a
    punpckhwd            m5, m6     ; 45b
    punpcklwd            m6, m7, m8 ; 67a
    punpckhwd            m7, m8     ; 67b
    punpcklwd            m0, m1, m2 ; 01a
    punpckhwd            m1, m2     ; 01b
    punpcklwd            m2, m3, m9 ; 23a
    punpckhwd            m3, m9     ; 23b
    mova                m10, [stk+0x140]
    mova                m11, [stk+0x150]
    mova                m14, [stk+0x160]
    mova                m15, [stk+0x170]
    mova         [stk+0x90], m4
    mova         [stk+0xa0], m5
    mova         [stk+0xb0], m6
    mova         [stk+0xc0], m7
 %define hround [rsp+0x10]
 %define shift  [rsp+0x20]
 %if isput
  %define vround [rsp+0x30]
 %else
  %define vround [base+pd_m524256]
 %endif
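; dy2 vertical loop (x86_64): one output row per pass; afterwards two
; fresh rows are filtered and the 23/45/67 pairs shift down two rows to
; become the new 01/23/45 pairs.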
.dy2_vloop:
    pmaddwd              m4, m0, m10
    pmaddwd              m5, m1, m10
    pmaddwd              m6, m2, m11
    pmaddwd              m7, m3, m11
    paddd                m4, m13
    paddd                m5, m13
    paddd                m4, m6
    paddd                m5, m7
    pmaddwd              m6, [stk+0x90], m14
    pmaddwd              m7, [stk+0xa0], m14
    pmaddwd              m8, [stk+0xb0], m15
    pmaddwd              m9, [stk+0xc0], m15
    paddd                m4, m6
    paddd                m5, m7
 %if isput
    pshufd               m6, m12, q1032
 %endif
    paddd                m4, m8
    paddd                m5, m9
%else
    movd                 r0, m15
    movd                 rX, m4
    pshufd              m15, m15, q0321
    pshufd               m4, m4, q0321
    movd                 r4, m15
    movd                 r5, m4
    mova                m14, [stk+0x110]
    movq                 m2, [base+subpel_filters+r0*8]
    movq                 m3, [base+subpel_filters+rX*8]
    movhps               m2, [base+subpel_filters+r4*8]
    movhps               m3, [base+subpel_filters+r5*8]
    psrld               m14, 10
    mova           [stk+16], m14
    mov                  r0, [stk+ 0]
    mov                  rX, [stk+ 4]
    mov                  r4, [stk+ 8]
    mov                  r5, [stk+12]
    mova         [stk+0x20], m0
    mova         [stk+0x30], m1
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
    pshufd               m4, m6, q1100
    pshufd               m6, m6, q3322
    pshufd               m7, m5, q1100
    pshufd               m5, m5, q3322
    pand                 m0, m9, m4
    pand                 m1, m9, m6
    pand                 m2, m9, m7
    pand                 m3, m9, m5
    pandn                m4, [stk+0x20]
    pandn                m6, [stk+0x30]
    pandn                m7, [stk+0x40]
    pandn                m5, [stk+0x50]
    por                  m0, m4
    por                  m1, m6
    por                  m2, m7
    por                  m3, m5
    punpcklbw            m4, m0, m0
    punpckhbw            m0, m0
    punpcklbw            m5, m1, m1
    punpckhbw            m1, m1
    psraw                m4, 8
    psraw                m0, 8
    psraw                m5, 8
    psraw                m1, 8
    punpcklbw            m6, m2, m2
    punpckhbw            m2, m2
    punpcklbw            m7, m3, m3
    punpckhbw            m3, m3
    psraw                m6, 8
    psraw                m2, 8
    psraw                m7, 8
    psraw                m3, 8
    mova        [stk+0x0a0], m4
    mova        [stk+0x0b0], m0
    mova        [stk+0x0c0], m5
    mova        [stk+0x0d0], m1
    mova        [stk+0x140], m6
    mova        [stk+0x150], m2
    mova        [stk+0x160], m7
    mova        [stk+0x170], m3
    MC_8TAP_SCALED_H   0xa0, 0x20, 0 ; 0
    MC_8TAP_SCALED_H   0xa0, 0x30    ; 1
    MC_8TAP_SCALED_H   0xa0, 0x40    ; 2
    MC_8TAP_SCALED_H   0xa0, 0x50    ; 3
    MC_8TAP_SCALED_H   0xa0, 0x60    ; 4
    MC_8TAP_SCALED_H   0xa0, 0x70    ; 5
    MC_8TAP_SCALED_H   0xa0, 0x80    ; 6
    MC_8TAP_SCALED_H   0xa0, 0x90    ; 7
    mova                 m5, [stk+0x60]
    mova                 m6, [stk+0x70]
    mova                 m7, [stk+0x80]
    mova                 m0, [stk+0x90]
    mov                  r0, r0m
    punpcklwd            m4, m5, m6      ; 45a
    punpckhwd            m5, m6          ; 45b
    punpcklwd            m6, m7, m0      ; 67a
    punpckhwd            m7, m0          ; 67b
    mova         [stk+0x60], m4
    mova         [stk+0x70], m5
    mova         [stk+0x80], m6
    mova         [stk+0x90], m7
    mova                 m1, [stk+0x20]
    mova                 m2, [stk+0x30]
    mova                 m3, [stk+0x40]
    mova                 m4, [stk+0x50]
    punpcklwd            m0, m1, m2      ; 01a
    punpckhwd            m1, m2          ; 01b
    punpcklwd            m2, m3, m4      ; 23a
    punpckhwd            m3, m4          ; 23b
    mova                 m4, [stk+0x180]
    mova                 m5, [stk+0x190]
    mova                 m6, [stk+0x1a0]
    mova                 m7, [stk+0x1b0]
    mova         [stk+0x40], m2
    mova         [stk+0x50], m3
6320.dy2_vloop:
6321    pmaddwd              m0, m4
6322    pmaddwd              m1, m4
6323    pmaddwd              m2, m5
6324    pmaddwd              m3, m5
6325    paddd                m0, m2
6326    paddd                m1, m3
6327    pmaddwd              m2, [stk+0x60], m6
6328    pmaddwd              m3, [stk+0x70], m6
6329    pmaddwd              m4, [stk+0x80], m7
6330    pmaddwd              m5, [stk+0x90], m7
6331 %if isput
6332    movd                 m6, [esp+0x18]
6333 %endif
6334    paddd                m0, m2
6335    paddd                m1, m3
6336    paddd                m0, vrnd_mem
6337    paddd                m1, vrnd_mem
6338    paddd                m4, m0
6339    paddd                m5, m1
6340%endif
6341%ifidn %1, put
6342    psrad                m4, m6
6343    psrad                m5, m6
6344    packssdw             m4, m5
6345    pxor                 m7, m7
6346    pmaxsw               m4, m7
6347    pminsw               m4, pxmaxm
6348    mova             [dstq], m4
6349    add                dstq, dsm
6350%else
6351    psrad                m4, 6
6352    psrad                m5, 6
6353    packssdw             m4, m5
6354    mova             [tmpq], m4
6355    add                tmpq, tmp_stridem
6356%endif
6357    dec                  hd
6358    jz .dy2_hloop_prep
6359%if ARCH_X86_64
6360    MC_8TAP_SCALED_H 4, 8, 5, 6, 7, 9, 0, 1
6361    mova         [stk+0xd0], m4
6362    MC_8TAP_SCALED_H 8, 5, 6, 7, 9, 4, 0, 1
6363    mova                 m4, [stk+0xd0]
6364    mova                 m0, m2         ; 01a
6365    mova                 m1, m3         ; 01b
6366    mova                 m2, [stk+0x90] ; 23a
6367    mova                 m3, [stk+0xa0] ; 23b
6368    mova                 m5, [stk+0xb0] ; 45a
6369    mova                 m6, [stk+0xc0] ; 45b
6370    punpcklwd            m7, m4, m8     ; 67a
6371    punpckhwd            m4, m8         ; 67b
6372    mova         [stk+0x90], m5
6373    mova         [stk+0xa0], m6
6374    mova         [stk+0xb0], m7
6375    mova         [stk+0xc0], m4
6376%else
6377    mov                 r0m, r0
6378    mov                  r3, r3m
6379    MC_8TAP_SCALED_H 0xa0, 0xe0 ; 8
6380    MC_8TAP_SCALED_H 0xa0, 0    ; 9
6381    mova                 m7, [stk+0xe0]
6382    mova                 m2, [stk+0x60] ; 23a
6383    mova                 m3, [stk+0x70] ; 23b
6384    mova                 m4, [stk+0x80] ; 45a
6385    mova                 m5, [stk+0x90] ; 45b
6386    punpcklwd            m6, m7, m0     ; 67a
6387    punpckhwd            m7, m0         ; 67b
6388    mova                 m0, [stk+0x40] ; 01a
6389    mova                 m1, [stk+0x50] ; 01b
6390    mova         [stk+0x40], m2
6391    mova         [stk+0x50], m3
6392    mova         [stk+0x60], m4
6393    mova         [stk+0x70], m5
6394    mova                 m4, [stk+0x180]
6395    mova                 m5, [stk+0x190]
6396    mova         [stk+0x80], m6
6397    mova         [stk+0x90], m7
6398    mova                 m6, [stk+0x1a0]
6399    mova                 m7, [stk+0x1b0]
6400    mov                  r0, r0m
6401%endif
6402    jmp .dy2_vloop
6403INIT_XMM ssse3
6404.ret:
6405    MC_8TAP_SCALED_RET 0
6406%if ARCH_X86_32 && !isprep && required_stack_alignment > STACK_ALIGNMENT
6407 %define r0m [rstk+stack_offset+ 4]
6408 %define r1m [rstk+stack_offset+ 8]
6409 %define r2m [rstk+stack_offset+12]
6410 %define r3m [rstk+stack_offset+16]
6411%endif
6412%undef isput
6413%undef isprep
6414%endmacro
6415
6416%macro BILIN_SCALED_FN 1
6417cglobal %1_bilin_scaled_16bpc
6418    mov                 t0d, (5*15 << 16) | 5*15
6419    mov                 t1d, (5*15 << 16) | 5*15
6420    jmp mangle(private_prefix %+ _%1_8tap_scaled_16bpc %+ SUFFIX)
6421%endmacro
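; Bilinear scaled MC reuses the generic 8-tap scaled code path: both halves
; of t0d/t1d are loaded with 5*15, which appears to select the bilinear
; coefficient rows of the subpel_filters table for the horizontal and
; vertical filters, before tail-jumping into the 8tap_scaled entry point.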

%if WIN64
DECLARE_REG_TMP 6, 5
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 8
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN put
FN put_8tap_scaled, sharp,          SHARP,   SHARP
FN put_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
FN put_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
FN put_8tap_scaled, smooth,         SMOOTH,  SMOOTH
FN put_8tap_scaled, sharp_regular,  SHARP,   REGULAR
FN put_8tap_scaled, regular_sharp,  REGULAR, SHARP
FN put_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
FN put_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN put_8tap_scaled, regular,        REGULAR, REGULAR
MC_8TAP_SCALED put

%if WIN64
DECLARE_REG_TMP 5, 4
%elif ARCH_X86_64
DECLARE_REG_TMP 6, 7
%else
DECLARE_REG_TMP 1, 2
%endif
BILIN_SCALED_FN prep
FN prep_8tap_scaled, sharp,          SHARP,   SHARP
FN prep_8tap_scaled, sharp_smooth,   SHARP,   SMOOTH
FN prep_8tap_scaled, smooth_sharp,   SMOOTH,  SHARP
FN prep_8tap_scaled, smooth,         SMOOTH,  SMOOTH
FN prep_8tap_scaled, sharp_regular,  SHARP,   REGULAR
FN prep_8tap_scaled, regular_sharp,  REGULAR, SHARP
FN prep_8tap_scaled, smooth_regular, SMOOTH,  REGULAR
FN prep_8tap_scaled, regular_smooth, REGULAR, SMOOTH
FN prep_8tap_scaled, regular,        REGULAR, REGULAR
MC_8TAP_SCALED prep

%if ARCH_X86_64
DECLARE_REG_TMP 6
%else
DECLARE_REG_TMP 2
%endif

%if ARCH_X86_64
; warp8x8t spills one less xmm register than warp8x8 on WIN64; compensate for
; that by allocating 16 bytes more stack space so that the stack offsets match up.
%if WIN64 && STACK_ALIGNMENT == 16
%assign stksz 16*14
%else
%assign stksz 16*13
%endif
cglobal warp_affine_8x8t_16bpc, 4, 13, 9, stksz, dst, ds, src, ss, delta, \
                                                 mx, tmp, alpha, beta, \
                                                 filter, my, gamma, cnt
%assign stack_size_padded_8x8t stack_size_padded
%else
cglobal warp_affine_8x8t_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                                 filter, mx, my
%define m8   [esp+16*13]
%define m9   [esp+16*14]
%define cntd dword [esp+4*63]
%define dstq tmpq
%define dsq  0
%if STACK_ALIGNMENT < 16
%define dstm [esp+4*65]
%define dsm  [esp+4*66]
%else
%define dstm r0m
%define dsm  r1m
%endif
%endif
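; On x86-32 there are not enough registers to keep everything live, so m8/m9
; are shadowed in stack memory, the loop counter lives in a fixed stack slot,
; and dst shares a register with tmp (with dst/ds mirrored via dstm/dsm).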
%define base filterq-$$
    mov                 t0d, r7m
    LEA             filterq, $$
    shr                 t0d, 11
%if ARCH_X86_64
    movddup              m8, [base+warp8x8t_rnd]
%else
    movddup              m1, [base+warp8x8t_rnd]
    mov                  r1, r1m
    add                  r1, r1
    mova                 m8, m1
    mov                 r1m, r1 ; ds *= 2
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main
    jmp .start
.loop:
%if ARCH_X86_64
    lea                dstq, [dstq+dsq*4]
%else
    add                dstq, dsm
    mov                dstm, dstq
%endif
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main2
.start:
%if ARCH_X86_32
    mov                dstq, dstm
%endif
    paddd                m1, m8
    paddd                m2, m8
    psrad                m1, 15
    psrad                m2, 15
    packssdw             m1, m2
    mova       [dstq+dsq*0], m1
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_ssse3).main3
%if ARCH_X86_32
    mov                dstq, dstm
    add                dstq, dsm
%endif
    paddd                m1, m8
    paddd                m2, m8
    psrad                m1, 15
    psrad                m2, 15
    packssdw             m1, m2
    mova       [dstq+dsq*2], m1
    dec                cntd
    jg .loop
    RET

%if ARCH_X86_64
cglobal warp_affine_8x8_16bpc, 4, 13, 10, 16*13, dst, ds, src, ss, delta, \
                                                 mx, tmp, alpha, beta, \
                                                 filter, my, gamma, cnt
ASSERT stack_size_padded == stack_size_padded_8x8t
%else
cglobal warp_affine_8x8_16bpc, 0, 7, 8, -16*17, alpha, gamma, src, tmp, \
                                                filter, mx, my
%endif
    mov                 t0d, r7m
    LEA             filterq, $$
    shr                 t0d, 11
%if ARCH_X86_64
    movddup              m8, [base+warp8x8_rnd2+t0*8]
    movd                 m9, r7m ; pixel_max
    pshufb               m9, [base+pw_256]
%else
    movddup              m1, [base+warp8x8_rnd2+t0*8]
    movd                 m2, r7m ; pixel_max
    pshufb               m2, [base+pw_256]
    mova                 m8, m1
    mova                 m9, m2
%endif
    call .main
    jmp .start
.loop:
%if ARCH_X86_64
    lea                dstq, [dstq+dsq*2]
%else
    add                dstq, dsm
    mov                dstm, dstq
%endif
    call .main2
.start:
%if ARCH_X86_32
    mov                dstq, dstm
%endif
    psrad                m1, 16
    psrad                m2, 16
    packssdw             m1, m2
    pmaxsw               m1, m6
    pmulhrsw             m1, m8
    pminsw               m1, m9
    mova       [dstq+dsq*0], m1
    call .main3
%if ARCH_X86_32
    mov                dstq, dstm
    add                dstq, dsm
%endif
    psrad                m1, 16
    psrad                m2, 16
    packssdw             m1, m2
    pmaxsw               m1, m6
    pmulhrsw             m1, m8
    pminsw               m1, m9
    mova       [dstq+dsq*1], m1
    dec                cntd
    jg .loop
    RET
ALIGN function_align
.main:
    ; Stack args offset by one (r4m -> r5m etc.) due to call
%if WIN64
    mov              deltaq, r5m
    mov                 mxd, r6m
%endif
    movd                 m0, [base+warp8x8_shift+t0*4]
    movddup              m7, [base+warp8x8_rnd1+t0*8]
    add             filterq, mc_warp_filter-$$
%if ARCH_X86_64
    movsx            alphad, word [deltaq+2*0]
    movsx             betad, word [deltaq+2*1]
    movsx            gammad, word [deltaq+2*2]
    movsx            deltad, word [deltaq+2*3]
    lea                tmpq, [ssq*3]
    add                 mxd, 512+(64<<10)
    sub                srcq, tmpq             ; src -= ss*3
    imul               tmpd, alphad, -7
    mov                 myd, r7m
    add               betad, tmpd             ; beta -= alpha*7
    imul               tmpd, gammad, -7
    add                 myd, 512+(64<<10)
    mov                cntd, 4
    add              deltad, tmpd             ; delta -= gamma*7
%else
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset - gprsize
%endif
    mov                 r3d, r5m              ; abcd
%if STACK_ALIGNMENT < 16
    mov                  r0, r1m              ; dst
    mov                  r1, r2m              ; ds
    mov  [esp+gprsize+4*65], r0
    mov  [esp+gprsize+4*66], r1
%endif
    movsx            alphad, word [r3+2*0]
    movsx               r2d, word [r3+2*1]
    movsx            gammad, word [r3+2*2]
    movsx               r3d, word [r3+2*3]
    imul                r5d, alphad, -7
    add                 r2d, r5d              ; beta -= alpha*7
    imul                r5d, gammad, -7
    mov  [esp+gprsize+4*60], r2d
    add                 r3d, r5d              ; delta -= gamma*7
    mov  [esp+gprsize+4*61], r3d
    mov                 r3d, r4m              ; ss
    mov                srcq, r3m
    mov                 mxd, r6m
    mov                 myd, r7m
    mov dword [esp+gprsize+4*63], 4           ; cnt
    mov  [esp+gprsize+4*62], r3
    lea                  r3, [r3*3]
    add                 mxd, 512+(64<<10)
    add                 myd, 512+(64<<10)
    sub                srcq, r3               ; src -= ss*3
%if STACK_ALIGNMENT < 16
    %assign stack_offset stack_offset + gprsize
%endif
%endif
    mova      [rsp+gprsize], m0
    pxor                 m6, m6
    call .h
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 01
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 1], m1
    mova [rsp+gprsize+16* 4], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 12
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 7], m1
    mova [rsp+gprsize+16*10], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 23
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 2], m1
    mova [rsp+gprsize+16* 5], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 34
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 8], m1
    mova [rsp+gprsize+16*11], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 45
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 3], m1
    mova [rsp+gprsize+16* 6], m5
    mova                 m5, m0
    call .h
    punpcklwd            m1, m5, m0           ; 56
    punpckhwd            m5, m0
    mova [rsp+gprsize+16* 9], m1
    mova [rsp+gprsize+16*12], m5
    mova                 m5, m0
.main2:
    call .h
%macro WARP_V 6 ; 01l, 23l, 45l, 01h, 23h, 45h
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m4, [filterq+myq*8]  ; a
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m2, [filterq+tmpq*8] ; b
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m3, [filterq+myq*8]  ; c
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m1, [filterq+tmpq*8] ; d
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    punpcklwd            m4, m2
    punpcklwd            m3, m1
    punpckldq            m2, m4, m3
    punpckhdq            m4, m3
    punpcklbw            m1, m6, m2           ; a0 a1 b0 b1 c0 c1 d0 d1 << 8
    pmaddwd              m1, [rsp+gprsize+16*%1]
    punpckhbw            m3, m6, m2           ; a2 a3 b2 b3 c2 c3 d2 d3 << 8
    mova                 m2, [rsp+gprsize+16*%2]
    pmaddwd              m3, m2
    mova [rsp+gprsize+16*%1], m2
    paddd                m1, m3
    punpcklbw            m3, m6, m4           ; a4 a5 b4 b5 c4 c5 d4 d5 << 8
    mova                 m2, [rsp+gprsize+16*%3]
    pmaddwd              m3, m2
    mova [rsp+gprsize+16*%2], m2
    paddd                m1, m3
    punpcklwd            m3, m5, m0           ; 67
    punpckhbw            m2, m6, m4           ; a6 a7 b6 b7 c6 c7 d6 d7 << 8
    pmaddwd              m2, m3
    mova [rsp+gprsize+16*%3], m3
    paddd                m1, m2
    movq                 m4, [filterq+myq*8]  ; e
    lea                 myd, [tmpq+gammaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8] ; f
    lea                tmpd, [myq+gammaq]
    shr                 myd, 10
    movq                 m2, [filterq+myq*8]  ; g
%if ARCH_X86_64
    lea                 myd, [tmpq+deltaq]    ; my += delta
%else
    mov                 myd, [esp+gprsize+4*61]
    add                 myd, tmpd
%endif
    shr                tmpd, 10
    punpcklwd            m4, m3
    movq                 m3, [filterq+tmpq*8] ; h
    punpcklwd            m2, m3
    punpckldq            m3, m4, m2
    punpckhdq            m4, m2
    punpcklbw            m2, m6, m3           ; e0 e1 f0 f1 g0 g1 h0 h1 << 8
    pmaddwd              m2, [rsp+gprsize+16*%4]
    punpckhbw            m6, m3               ; e2 e3 f2 f3 g2 g3 h2 h3 << 8
    mova                 m3, [rsp+gprsize+16*%5]
    pmaddwd              m6, m3
    mova [rsp+gprsize+16*%4], m3
    pxor                 m3, m3
    paddd                m2, m6
    punpcklbw            m3, m4               ; e4 e5 f4 f5 g4 g5 h4 h5 << 8
    mova                 m6, [rsp+gprsize+16*%6]
    pmaddwd              m3, m6
    mova [rsp+gprsize+16*%5], m6
    punpckhwd            m5, m0
    pxor                 m6, m6
    paddd                m2, m3
    punpckhbw            m3, m6, m4           ; e6 e7 f6 f7 g6 g7 h6 h7 << 8
    pmaddwd              m3, m5
    mova [rsp+gprsize+16*%6], m5
    mova                 m5, m0
    paddd                m2, m3
%endmacro
    WARP_V                1,  2,  3,  4,  5,  6
    ret
.main3:
    call .h
    WARP_V                7,  8,  9, 10, 11, 12
    ret
ALIGN function_align
.h:
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    punpcklbw            m0, m6, m3
    movu                 m3, [srcq-6]
    pmaddwd              m0, m3               ; 0
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq-4]
    pmaddwd              m2, m3               ; 1
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m0, m2               ; 0 1
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq-2]
    pmaddwd              m2, m3               ; 2
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m1, m6, m3
    movu                 m3, [srcq+0]
    pmaddwd              m1, m3               ; 3
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m2, m1               ; 2 3
    punpcklbw            m1, m6, m3
    movu                 m3, [srcq+2]
    pmaddwd              m1, m3               ; 4
    lea                 mxd, [tmpq+alphaq]
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    phaddd               m0, m2               ; 0 1 2 3
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq+4]
    pmaddwd              m2, m3               ; 5
    lea                tmpd, [mxq+alphaq]
    shr                 mxd, 10
    movq                 m3, [filterq+mxq*8]
    phaddd               m1, m2               ; 4 5
    punpcklbw            m2, m6, m3
    movu                 m3, [srcq+6]
    pmaddwd              m2, m3               ; 6
%if ARCH_X86_64
    lea                 mxd, [tmpq+betaq]     ; mx += beta
%else
    mov                 mxd, [esp+gprsize*2+4*60]
    add                 mxd, tmpd
%endif
    shr                tmpd, 10
    movq                 m3, [filterq+tmpq*8]
    punpcklbw            m4, m6, m3
    movu                 m3, [srcq+8]
%if ARCH_X86_64
    add                srcq, ssq
%else
    add                srcq, [esp+gprsize*2+4*62]
%endif
    pmaddwd              m3, m4               ; 7
    phaddd               m2, m3               ; 6 7
    phaddd               m1, m2               ; 4 5 6 7
    paddd                m0, m7
    paddd                m1, m7
    psrad                m0, [rsp+gprsize*2]
    psrad                m1, [rsp+gprsize*2]
    packssdw             m0, m1
    ret
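; .h (above) performs one horizontal pass producing 8 filtered pixels: each
; pixel's 8-tap filter row is selected by mx>>10, with mx advancing by alpha
; per pixel and by beta per row. The int8 taps are expanded into the high
; bytes of 16-bit lanes (i.e. scaled by 1 << 8) before pmaddwd, partial sums
; are merged with phaddd, and the result is rounded and shifted by the
; bitdepth-dependent shift spilled on the stack.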

%macro BIDIR_FN 0
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4:
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.ret:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jne .w8_loop
    RET
.w16_loop:
    call .main
    add                dstq, strideq
.w16:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    dec                  hd
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
%endmacro
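; BIDIR_FN (above) supplies the width-dispatched store loops shared by the
; bidirectional compositing functions below; each .main call is expected to
; produce two registers (m0/m1) worth of finished output pixels.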

%if UNIX64
DECLARE_REG_TMP 7
%else
DECLARE_REG_TMP 5
%endif

cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h
%define base r6-avg_ssse3_table
    LEA                  r6, avg_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r6m ; pixel_max
    movsxd               wq, [r6+wq*4]
    shr                 t0d, 11
    movddup              m2, [base+bidir_rnd+t0*8]
    movddup              m3, [base+bidir_mul+t0*8]
    movifnidn            hd, hm
    add                  wq, r6
    BIDIR_FN
ALIGN function_align
.main:
    mova                 m0, [tmp1q+16*0]
    paddsw               m0, [tmp2q+16*0]
    mova                 m1, [tmp1q+16*1]
    paddsw               m1, [tmp2q+16*1]
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    pmaxsw               m0, m2
    pmaxsw               m1, m2
    psubsw               m0, m2
    psubsw               m1, m2
    pmulhw               m0, m3
    pmulhw               m1, m3
    ret
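; avg: paddsw sums the two intermediates with signed saturation, then
; pmaxsw/psubsw clamp against and remove the (negative) bidir rounding
; constant, and pmulhw scales the result back to pixel range; the saturating
; steps effectively double as the clip to [0, pixel_max].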

cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h
%define base r6-w_avg_ssse3_table
    LEA                  r6, w_avg_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r6m ; weight
    movd                 m6, r7m ; pixel_max
    movddup              m5, [base+pd_65538]
    movsxd               wq, [r6+wq*4]
    pshufb               m6, [base+pw_256]
    add                  wq, r6
    lea                 r6d, [t0-16]
    shl                 t0d, 16
    sub                 t0d, r6d ; 16-weight, weight
    paddw                m5, m6
    mov                 r6d, t0d
    shl                 t0d, 2
    test          dword r7m, 0x800
    cmovnz              r6d, t0d
    movifnidn            hd, hm
    movd                 m4, r6d
    pslld                m5, 7
    pxor                 m7, m7
    pshufd               m4, m4, q0000
    BIDIR_FN
ALIGN function_align
.main:
    mova                 m2, [tmp1q+16*0]
    mova                 m0, [tmp2q+16*0]
    punpckhwd            m3, m0, m2
    punpcklwd            m0, m2
    mova                 m2, [tmp1q+16*1]
    mova                 m1, [tmp2q+16*1]
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    pmaddwd              m3, m4
    pmaddwd              m0, m4
    paddd                m3, m5
    paddd                m0, m5
    psrad                m3, 8
    psrad                m0, 8
    packssdw             m0, m3
    punpckhwd            m3, m1, m2
    punpcklwd            m1, m2
    pmaddwd              m3, m4
    pmaddwd              m1, m4
    paddd                m3, m5
    paddd                m1, m5
    psrad                m3, 8
    psrad                m1, 8
    packssdw             m1, m3
    pminsw               m0, m6
    pminsw               m1, m6
    pmaxsw               m0, m7
    pmaxsw               m1, m7
    ret
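; w_avg interleaves tmp2/tmp1 words so that a single pmaddwd per dword lane
; computes tmp2*(16-weight) + tmp1*weight against the packed (16-w, w)
; constant in m4, then adds the rounding bias in m5, shifts right by 8, and
; clamps to [0, pixel_max] with pminsw/pmaxsw.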

%if ARCH_X86_64
cglobal mask_16bpc, 4, 7, 9, dst, stride, tmp1, tmp2, w, h, mask
%else
cglobal mask_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, mask
%define hd dword r5m
%define m8 [base+pw_64]
%endif
%define base r6-mask_ssse3_table
    LEA                  r6, mask_ssse3_table
    tzcnt                wd, wm
    mov                 t0d, r7m ; pixel_max
    shr                 t0d, 11
    movsxd               wq, [r6+wq*4]
    movddup              m6, [base+bidir_rnd+t0*8]
    movddup              m7, [base+bidir_mul+t0*8]
%if ARCH_X86_64
    mova                 m8, [base+pw_64]
    movifnidn            hd, hm
%endif
    add                  wq, r6
    mov               maskq, r6mp
    BIDIR_FN
ALIGN function_align
.main:
    movq                 m3, [maskq+8*0]
    mova                 m0, [tmp1q+16*0]
    mova                 m4, [tmp2q+16*0]
    pxor                 m5, m5
    punpcklbw            m3, m5
    punpckhwd            m2, m0, m4
    punpcklwd            m0, m4
    psubw                m1, m8, m3
    punpckhwd            m4, m3, m1 ; m, 64-m
    punpcklwd            m3, m1
    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
    pmaddwd              m0, m3
    movq                 m3, [maskq+8*1]
    mova                 m1, [tmp1q+16*1]
    mova                 m4, [tmp2q+16*1]
    add               maskq, 8*2
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    psrad                m2, 5
    psrad                m0, 5
    packssdw             m0, m2
    punpcklbw            m3, m5
    punpckhwd            m2, m1, m4
    punpcklwd            m1, m4
    psubw                m5, m8, m3
    punpckhwd            m4, m3, m5 ; m, 64-m
    punpcklwd            m3, m5
    pmaddwd              m2, m4     ; tmp1 * m + tmp2 * (64-m)
    pmaddwd              m1, m3
    psrad                m2, 5
    psrad                m1, 5
    packssdw             m1, m2
    pmaxsw               m0, m6
    pmaxsw               m1, m6
    psubsw               m0, m6
    psubsw               m1, m6
    pmulhw               m0, m7
    pmulhw               m1, m7
    ret
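; mask: per-pixel 6-bit weights m are loaded from maskq and zero-extended;
; each dword lane computes tmp1*m + tmp2*(64-m) with pmaddwd, is shifted
; right by 5, and then goes through the same bidir rnd/mul finishing sequence
; as avg to produce the final pixels.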

cglobal w_mask_420_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_420_ssse3_table
    LEA                  t0, w_mask_420_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movd                 m0, r7m ; sign
    shr                 r6d, 11
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    mova                 m1, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m4, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*4
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    mova         [rsp+16*3], m4
    %define              m8  [rsp+gprsize+16*0]
    %define              m9  [rsp+gprsize+16*1]
    %define             m10  [rsp+gprsize+16*2]
    %define             m11  [rsp+gprsize+16*3]
%endif
    movd                 m7, [base+pw_2]
    psubw                m7, m0
    pshufb               m7, [base+pw_256]
    add                  wq, t0
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 4
.w4:
    movq   [dstq+strideq*0], m0
    phaddw               m2, m3
    movhps [dstq+strideq*1], m0
    phaddd               m2, m2
    lea                dstq, [dstq+strideq*2]
    paddw                m2, m7
    movq   [dstq+strideq*0], m1
    psrlw                m2, 2
    movhps [dstq+strideq*1], m1
    packuswb             m2, m2
    movd            [maskq], m2
    sub                  hd, 4
    jg .w4_loop
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 4
.w8:
    mova   [dstq+strideq*0], m0
    paddw                m2, m3
    phaddw               m2, m2
    mova   [dstq+strideq*1], m1
    paddw                m2, m7
    psrlw                m2, 2
    packuswb             m2, m2
    movd            [maskq], m2
    sub                  hd, 2
    jg .w8_loop
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 8
.w16:
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*0]
    paddw                m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*1], m1
    paddw                m2, m7
    psrlw                m2, 2
    packuswb             m2, m2
    movq            [maskq], m2
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16
.w32:
    mova [dstq+strideq*1+16*0], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*1], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*0+16*2], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*3], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*0]
    paddw                m3, [dstq+strideq*1+16*1]
    mova [dstq+strideq*1+16*0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*2]
    paddw                m2, [dstq+strideq*1+16*3]
    mova [dstq+strideq*1+16*2], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb             m3, m2
    mova            [maskq], m3
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16*2
.w64:
    mova [dstq+strideq*1+16*1], m2
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*1+16*2], m3
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*3], m2
    mova [dstq+strideq*0+16*2], m0
    mova [dstq+strideq*1+16*4], m3
    mova [dstq+strideq*0+16*3], m1
    call .main
    mova [dstq+strideq*1+16*5], m2
    mova [dstq+strideq*0+16*4], m0
    mova [dstq+strideq*1+16*6], m3
    mova [dstq+strideq*0+16*5], m1
    call .main
    mova [dstq+strideq*0+16*6], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*7], m2
    mova [dstq+strideq*0+16*7], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*1]
    paddw                m3, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*2], m2
    mova [dstq+strideq*1+16*1], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*3]
    paddw                m3, [dstq+strideq*1+16*4]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*2]
    mova [dstq+strideq*1+16*2], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*3], m1
    packuswb             m3, m2
    mova       [maskq+16*0], m3
    call .main
    paddw                m2, [dstq+strideq*1+16*5]
    paddw                m3, [dstq+strideq*1+16*6]
    mova [dstq+strideq*1+16*4], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*6], m2
    mova [dstq+strideq*1+16*5], m1
    call .main
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*6]
    paddw                m2, [dstq+strideq*1+16*7]
    mova [dstq+strideq*1+16*6], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*7], m1
    packuswb             m3, m2
    mova       [maskq+16*1], m3
    sub                  hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 16*4
.w128:
    mova [dstq+strideq*1+16* 1], m2
    mova [dstq+strideq*0+16* 0], m0
    mova [dstq+strideq*1+16* 2], m3
    mova [dstq+strideq*0+16* 1], m1
    call .main
    mova [dstq+strideq*1+16* 3], m2
    mova [dstq+strideq*0+16* 2], m0
    mova [dstq+strideq*1+16* 4], m3
    mova [dstq+strideq*0+16* 3], m1
    call .main
    mova [dstq+strideq*1+16* 5], m2
    mova [dstq+strideq*0+16* 4], m0
    mova [dstq+strideq*1+16* 6], m3
    mova [dstq+strideq*0+16* 5], m1
    call .main
    mova [dstq+strideq*1+16* 7], m2
    mova [dstq+strideq*0+16* 6], m0
    mova [dstq+strideq*1+16* 8], m3
    mova [dstq+strideq*0+16* 7], m1
    call .main
    mova [dstq+strideq*1+16* 9], m2
    mova [dstq+strideq*0+16* 8], m0
    mova [dstq+strideq*1+16*10], m3
    mova [dstq+strideq*0+16* 9], m1
    call .main
    mova [dstq+strideq*1+16*11], m2
    mova [dstq+strideq*0+16*10], m0
    mova [dstq+strideq*1+16*12], m3
    mova [dstq+strideq*0+16*11], m1
    call .main
    mova [dstq+strideq*1+16*13], m2
    mova [dstq+strideq*0+16*12], m0
    mova [dstq+strideq*1+16*14], m3
    mova [dstq+strideq*0+16*13], m1
    call .main
    mova [dstq+strideq*0+16*14], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*15], m2
    mova [dstq+strideq*0+16*15], m1
    call .main
    paddw                m2, [dstq+strideq*1+16* 1]
    paddw                m3, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 0], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16* 2], m2
    mova [dstq+strideq*1+16* 1], m1
    call .main
    paddw                m2, [dstq+strideq*1+16* 3]
    paddw                m3, [dstq+strideq*1+16* 4]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16* 2]
    mova [dstq+strideq*1+16* 2], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16* 3], m1
    packuswb             m3, m2
    mova       [maskq+16*0], m3
    call .main
    paddw                m2, [dstq+strideq*1+16* 5]
    paddw                m3, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 4], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16* 6], m2
    mova [dstq+strideq*1+16* 5], m1
    call .main
    paddw                m2, [dstq+strideq*1+16* 7]
    paddw                m3, [dstq+strideq*1+16* 8]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16* 6]
    mova [dstq+strideq*1+16* 6], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16* 7], m1
    packuswb             m3, m2
    mova       [maskq+16*1], m3
    call .main
    paddw                m2, [dstq+strideq*1+16* 9]
    paddw                m3, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16* 8], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*10], m2
    mova [dstq+strideq*1+16* 9], m1
    call .main
    paddw                m2, [dstq+strideq*1+16*11]
    paddw                m3, [dstq+strideq*1+16*12]
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*10]
    mova [dstq+strideq*1+16*10], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*11], m1
    packuswb             m3, m2
    mova       [maskq+16*2], m3
    call .main
    paddw                m2, [dstq+strideq*1+16*13]
    paddw                m3, [dstq+strideq*1+16*14]
    mova [dstq+strideq*1+16*12], m0
    phaddw               m2, m3
    mova [dstq+strideq*1+16*14], m2
    mova [dstq+strideq*1+16*13], m1
    call .main
    phaddw               m2, m3
    paddw                m3, m7, [dstq+strideq*1+16*14]
    paddw                m2, [dstq+strideq*1+16*15]
    mova [dstq+strideq*1+16*14], m0
    paddw                m2, m7
    psrlw                m3, 2
    psrlw                m2, 2
    mova [dstq+strideq*1+16*15], m1
    packuswb             m3, m2
    mova       [maskq+16*3], m3
    sub                  hd, 2
    jg .w128_loop
    RET
ALIGN function_align
.main:
%macro W_MASK 2 ; dst/tmp_offset, mask
    mova                m%1, [tmp1q+16*%1]
    mova                m%2, [tmp2q+16*%1]
    punpcklwd            m4, m%2, m%1
    punpckhwd            m5, m%2, m%1
    psubsw              m%1, m%2
    pabsw               m%1, m%1
    psubusw              m6, m8, m%1
    psrlw                m6, 10      ; 64-m
    psubw               m%2, m9, m6  ; m
    punpcklwd           m%1, m6, m%2
    punpckhwd            m6, m%2
    pmaddwd             m%1, m4
    pmaddwd              m6, m5
    psrad               m%1, 5
    psrad                m6, 5
    packssdw            m%1, m6
    pmaxsw              m%1, m10
    psubsw              m%1, m10
    pmulhw              m%1, m11
%endmacro
    W_MASK                0, 2
    W_MASK                1, 3
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    ret
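; w_mask_420 derives a per-pixel weight m in [38, 64] from |tmp1 - tmp2|
; (W_MASK above) and blends like mask; the width loops then average each
; 2x2 group of weights, rounding with the sign-adjusted pw_2 bias in m7,
; before packing the 420-subsampled mask bytes to maskq.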

cglobal w_mask_422_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_422_ssse3_table
    LEA                  t0, w_mask_422_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movd                 m7, r7m ; sign
    shr                 r6d, 11
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615]
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    mova                 m1, [base+pw_27615]
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m4, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*4
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    mova         [rsp+16*3], m4
%endif
    pxor                 m0, m0
    add                  wq, t0
    pshufb               m7, m0
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4:
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    W_MASK                0, 2
    W_MASK                1, 3
    phaddw               m2, m3
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    packuswb             m2, m2
    pxor                 m3, m3
    psubb                m2, m7
    pavgb                m2, m3
    movq            [maskq], m2
    add               maskq, 8
    ret
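; w_mask_422 subsamples the mask horizontally only: adjacent weight pairs
; are summed with phaddw and packed to bytes, the sign adjustment is applied
; with psubb, and pavgb against zero performs the rounded divide by 2 before
; the bytes are stored to maskq.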

cglobal w_mask_444_16bpc, 4, 7, 12, dst, stride, tmp1, tmp2, w, h, mask
%define base t0-w_mask_444_ssse3_table
    LEA                  t0, w_mask_444_ssse3_table
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    shr                 r6d, 11
    movsxd               wq, [t0+wq*4]
%if ARCH_X86_64
    mova                 m8, [base+pw_27615]
    mova                 m9, [base+pw_64]
    movddup             m10, [base+bidir_rnd+r6*8]
    movddup             m11, [base+bidir_mul+r6*8]
%else
    mova                 m1, [base+pw_27615]
    mova                 m2, [base+pw_64]
    movddup              m3, [base+bidir_rnd+r6*8]
    movddup              m7, [base+bidir_mul+r6*8]
    ALLOC_STACK       -16*3
    mova         [rsp+16*0], m1
    mova         [rsp+16*1], m2
    mova         [rsp+16*2], m3
    %define             m11  m7
%endif
    add                  wq, t0
    movifnidn            hd, r5m
    mov               maskq, r6mp
    call .main
    jmp                  wq
.w4_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w4:
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    movq   [dstq+strideq*0], m1
    movhps [dstq+strideq*1], m1
    sub                  hd, 4
    jg .w4_loop
.end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w8:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w16:
    mova [dstq+strideq*0+16*0], m0
    mova [dstq+strideq*0+16*1], m1
    call .main
    mova [dstq+strideq*1+16*0], m0
    mova [dstq+strideq*1+16*1], m1
    sub                  hd, 2
    jg .w16_loop
    RET
.w32_loop:
    call .main
    add                dstq, strideq
.w32:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    dec                  hd
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    call .main
    mova        [dstq+16*2], m0
    mova        [dstq+16*3], m1
    call .main
    mova        [dstq+16*4], m0
    mova        [dstq+16*5], m1
    call .main
    mova        [dstq+16*6], m0
    mova        [dstq+16*7], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    mova       [dstq+16* 0], m0
    mova       [dstq+16* 1], m1
    call .main
    mova       [dstq+16* 2], m0
    mova       [dstq+16* 3], m1
    call .main
    mova       [dstq+16* 4], m0
    mova       [dstq+16* 5], m1
    call .main
    mova       [dstq+16* 6], m0
    mova       [dstq+16* 7], m1
    call .main
    mova       [dstq+16* 8], m0
    mova       [dstq+16* 9], m1
    call .main
    mova       [dstq+16*10], m0
    mova       [dstq+16*11], m1
    call .main
    mova       [dstq+16*12], m0
    mova       [dstq+16*13], m1
    call .main
    mova       [dstq+16*14], m0
    mova       [dstq+16*15], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    W_MASK                0, 2
    W_MASK                1, 3
    packuswb             m2, m3
    add               tmp1q, 16*2
    add               tmp2q, 16*2
    mova            [maskq], m2
    add               maskq, 16
    ret
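; w_mask_444 stores the mask at full resolution: the two weight vectors are
; simply packed to bytes and written out, with no subsampling or sign
; adjustment involved.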

; (a * (64 - m) + b * m + 32) >> 6
; = (((b - a) * m + 32) >> 6) + a
; = (((b - a) * (m << 9) + 16384) >> 15) + a
;   except m << 9 overflows int16_t when m == 64 (which is possible),
;   but if we negate m it works out (-64 << 9 == -32768).
; = (((a - b) * (m * -512) + 16384) >> 15) + a
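; As a scalar reference, the sequence below corresponds to this C sketch
; (blend_px is a hypothetical helper; a = dst, b = tmp, 0 <= m <= 64):
;
;   int blend_px(int a, int b, int m) {
;       int f = m * -512;                     // pmullw with pw_m512
;       int d = ((a - b) * f + 16384) >> 15;  // pmulhrsw rounding multiply
;       return a + d;                         // paddw
;   }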
7733cglobal blend_16bpc, 3, 7, 8, dst, stride, tmp, w, h, mask, stride3
7734%define base r6-blend_ssse3_table
7735    LEA                  r6, blend_ssse3_table
7736    tzcnt                wd, wm
7737    movifnidn            hd, hm
7738    movsxd               wq, [r6+wq*4]
7739    movifnidn         maskq, maskmp
7740    mova                 m7, [base+pw_m512]
7741    add                  wq, r6
7742    lea            stride3q, [strideq*3]
7743    pxor                 m6, m6
7744    jmp                  wq
7745.w4:
7746    mova                 m5, [maskq]
7747    movq                 m0, [dstq+strideq*0]
7748    movhps               m0, [dstq+strideq*1]
7749    movq                 m1, [dstq+strideq*2]
7750    movhps               m1, [dstq+stride3q ]
7751    psubw                m2, m0, [tmpq+16*0]
7752    psubw                m3, m1, [tmpq+16*1]
7753    add               maskq, 16
7754    add                tmpq, 32
7755    punpcklbw            m4, m5, m6
7756    punpckhbw            m5, m6
7757    pmullw               m4, m7
7758    pmullw               m5, m7
7759    pmulhrsw             m2, m4
7760    pmulhrsw             m3, m5
7761    paddw                m0, m2
7762    paddw                m1, m3
7763    movq   [dstq+strideq*0], m0
7764    movhps [dstq+strideq*1], m0
7765    movq   [dstq+strideq*2], m1
7766    movhps [dstq+stride3q ], m1
7767    lea                dstq, [dstq+strideq*4]
7768    sub                  hd, 4
7769    jg .w4
7770    RET
7771.w8:
7772    mova                 m5, [maskq]
7773    mova                 m0, [dstq+strideq*0]
7774    mova                 m1, [dstq+strideq*1]
7775    psubw                m2, m0, [tmpq+16*0]
7776    psubw                m3, m1, [tmpq+16*1]
7777    add               maskq, 16
7778    add                tmpq, 32
7779    punpcklbw            m4, m5, m6
7780    punpckhbw            m5, m6
7781    pmullw               m4, m7
7782    pmullw               m5, m7
7783    pmulhrsw             m2, m4
7784    pmulhrsw             m3, m5
7785    paddw                m0, m2
7786    paddw                m1, m3
7787    mova   [dstq+strideq*0], m0
7788    mova   [dstq+strideq*1], m1
7789    lea                dstq, [dstq+strideq*2]
7790    sub                  hd, 2
7791    jg .w8
7792    RET
7793.w16:
7794    mova                 m5, [maskq]
7795    mova                 m0, [dstq+16*0]
7796    mova                 m1, [dstq+16*1]
7797    psubw                m2, m0, [tmpq+16*0]
7798    psubw                m3, m1, [tmpq+16*1]
7799    add               maskq, 16
7800    add                tmpq, 32
7801    punpcklbw            m4, m5, m6
7802    punpckhbw            m5, m6
7803    pmullw               m4, m7
7804    pmullw               m5, m7
7805    pmulhrsw             m2, m4
7806    pmulhrsw             m3, m5
7807    paddw                m0, m2
7808    paddw                m1, m3
7809    mova        [dstq+16*0], m0
7810    mova        [dstq+16*1], m1
7811    add                dstq, strideq
7812    dec                  hd
7813    jg .w16
7814    RET
7815.w32:
7816    mova                 m5, [maskq+16*0]
7817    mova                 m0, [dstq+16*0]
7818    mova                 m1, [dstq+16*1]
7819    psubw                m2, m0, [tmpq+16*0]
7820    psubw                m3, m1, [tmpq+16*1]
7821    punpcklbw            m4, m5, m6
7822    punpckhbw            m5, m6
7823    pmullw               m4, m7
7824    pmullw               m5, m7
7825    pmulhrsw             m2, m4
7826    pmulhrsw             m3, m5
7827    paddw                m0, m2
7828    paddw                m1, m3
7829    mova        [dstq+16*0], m0
7830    mova        [dstq+16*1], m1
7831    mova                 m5, [maskq+16*1]
7832    mova                 m0, [dstq+16*2]
7833    mova                 m1, [dstq+16*3]
7834    psubw                m2, m0, [tmpq+16*2]
7835    psubw                m3, m1, [tmpq+16*3]
7836    add               maskq, 32
7837    add                tmpq, 64
7838    punpcklbw            m4, m5, m6
7839    punpckhbw            m5, m6
7840    pmullw               m4, m7
7841    pmullw               m5, m7
7842    pmulhrsw             m2, m4
7843    pmulhrsw             m3, m5
7844    paddw                m0, m2
7845    paddw                m1, m3
7846    mova        [dstq+16*2], m0
7847    mova        [dstq+16*3], m1
7848    add                dstq, strideq
7849    dec                  hd
7850    jg .w32
7851    RET
7852
7853cglobal blend_v_16bpc, 3, 6, 6, dst, stride, tmp, w, h
7854%define base r5-blend_v_ssse3_table
7855    LEA                  r5, blend_v_ssse3_table
7856    tzcnt                wd, wm
7857    movifnidn            hd, hm
7858    movsxd               wq, [r5+wq*4]
7859    add                  wq, r5
7860    jmp                  wq
.w2:
    movd                 m4, [base+obmc_masks+2*2]
.w2_loop:
    movd                 m0, [dstq+strideq*0]
    movd                 m2, [tmpq+4*0]
    movd                 m1, [dstq+strideq*1]
    movd                 m3, [tmpq+4*1]
    add                tmpq, 4*2
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    movd   [dstq+strideq*0], m0
    movd   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w2_loop
    RET
.w4:
    movddup              m2, [base+obmc_masks+4*2]
.w4_loop:
    movq                 m0, [dstq+strideq*0]
    movhps               m0, [dstq+strideq*1]
    mova                 m1, [tmpq]
    add                tmpq, 8*2
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    movq   [dstq+strideq*0], m0
    movhps [dstq+strideq*1], m0
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
.w8:
    mova                 m4, [base+obmc_masks+8*2]
.w8_loop:
    mova                 m0, [dstq+strideq*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+strideq*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    mova                 m4, [base+obmc_masks+16*2]
    movq                 m5, [base+obmc_masks+16*3]
.w16_loop:
    mova                 m0, [dstq+16*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+16*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    add                dstq, strideq
    dec                  hd
    jg .w16_loop
    RET
.w32:
%if WIN64
    movaps          [rsp+8], m6
%endif
    mova                 m4, [base+obmc_masks+16*4]
    mova                 m5, [base+obmc_masks+16*5]
    mova                 m6, [base+obmc_masks+16*6]
.w32_loop:
    mova                 m0, [dstq+16*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+16*1]
    mova                 m3, [tmpq+16*1]
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    mova                 m2, [dstq+16*2]
    paddw                m1, m3
    mova                 m3, [tmpq+16*2]
    add                tmpq, 16*4
    psubw                m3, m2
    pmulhrsw             m3, m6
    paddw                m2, m3
    mova        [dstq+16*0], m0
    mova        [dstq+16*1], m1
    mova        [dstq+16*2], m2
    add                dstq, strideq
    dec                  hd
    jg .w32_loop
%if WIN64
    movaps               m6, [rsp+8]
%endif
    RET

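; BLEND_H_ROW blends 16 pixels of one row: the two xmm registers at
; dst+16*dst_off against tmp+16*tmp_off, weighted by the per-row mask
; broadcast in m5; a nonzero inc_tmp advances tmpq afterwards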
%macro BLEND_H_ROW 2-3 0 ; dst_off, tmp_off, inc_tmp
    mova                 m0, [dstq+16*(%1+0)]
    mova                 m2, [tmpq+16*(%2+0)]
    mova                 m1, [dstq+16*(%1+1)]
    mova                 m3, [tmpq+16*(%2+1)]
%if %3
    add                tmpq, 16*%3
%endif
    psubw                m2, m0
    psubw                m3, m1
    pmulhrsw             m2, m5
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova   [dstq+16*(%1+0)], m0
    mova   [dstq+16*(%1+1)], m1
%endmacro

cglobal blend_h_16bpc, 3, 7, 6, dst, ds, tmp, w, h, mask
%define base r6-blend_h_ssse3_table
    LEA                  r6, blend_h_ssse3_table
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    movddup              m4, [base+blend_shuf]
    lea               maskq, [base+obmc_masks+hq*2]
    lea                  hd, [hq*3]
    add                  wq, r6
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]
    neg                  hq
    jmp                  wq
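    ; only the first h*3/4 rows are blended; the masks for height h start at
    ; obmc_masks[h], and maskq is biased so that [maskq+hq*2] walks them as
    ; the negated row counter hq counts up towards zero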
.w2:
    movd                 m0, [dstq+dsq*0]
    movd                 m2, [dstq+dsq*1]
    movd                 m3, [maskq+hq*2]
    movq                 m1, [tmpq]
    add                tmpq, 4*2
    punpckldq            m0, m2
    punpcklwd            m3, m3
    psubw                m1, m0
    pmulhrsw             m1, m3
    paddw                m0, m1
    movd       [dstq+dsq*0], m0
    psrlq                m0, 32
    movd       [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
.w4:
    mova                 m3, [base+blend_shuf]
.w4_loop:
    movq                 m0, [dstq+dsq*0]
    movhps               m0, [dstq+dsq*1]
    movd                 m2, [maskq+hq*2]
    mova                 m1, [tmpq]
    add                tmpq, 8*2
    psubw                m1, m0
    pshufb               m2, m3
    pmulhrsw             m1, m2
    paddw                m0, m1
    movq       [dstq+dsq*0], m0
    movhps     [dstq+dsq*1], m0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
.w8:
    movddup              m5, [base+blend_shuf+8]
%if WIN64
    movaps         [rsp+ 8], m6
    movaps         [rsp+24], m7
%endif
.w8_loop:
    movd                 m7, [maskq+hq*2]
    mova                 m0, [dstq+dsq*0]
    mova                 m2, [tmpq+16*0]
    mova                 m1, [dstq+dsq*1]
    mova                 m3, [tmpq+16*1]
    add                tmpq, 16*2
    pshufb               m6, m7, m4
    psubw                m2, m0
    pshufb               m7, m5
    psubw                m3, m1
    pmulhrsw             m2, m6
    pmulhrsw             m3, m7
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
%if WIN64
    movaps               m6, [rsp+ 8]
    movaps               m7, [rsp+24]
%endif
    RET
.w16:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0, 2
    add                dstq, dsq
    inc                  hq
    jl .w16
    RET
.w32:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0
    BLEND_H_ROW           2, 2, 4
    add                dstq, dsq
    inc                  hq
    jl .w32
    RET
.w64:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0, 0
    BLEND_H_ROW           2, 2
    BLEND_H_ROW           4, 4
    BLEND_H_ROW           6, 6, 8
    add                dstq, dsq
    inc                  hq
    jl .w64
    RET
.w128:
    movd                 m5, [maskq+hq*2]
    pshufb               m5, m4
    BLEND_H_ROW           0,  0
    BLEND_H_ROW           2,  2
    BLEND_H_ROW           4,  4
    BLEND_H_ROW           6,  6, 16
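    ; tmpq was already advanced by 16*16 above, hence the negative tmp
    ; offsets for the second half of the row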
    BLEND_H_ROW           8, -8
    BLEND_H_ROW          10, -6
    BLEND_H_ROW          12, -4
    BLEND_H_ROW          14, -2
    add                dstq, dsq
    inc                  hq
    jl .w128
    RET

; emu_edge args:
; const intptr_t bw, const intptr_t bh, const intptr_t iw, const intptr_t ih,
; const intptr_t x, const intptr_t y, pixel *dst, const ptrdiff_t dst_stride,
; const pixel *ref, const ptrdiff_t ref_stride
;
; bw, bh: total size of the filled block
; iw, ih: size of the copied source block -> fill bottom and right edges
; x, y:   offset of the source block within bw/bh -> fill top and left edges
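;
; the output block is built in three passes: v_loop copies the clipped source
; rows (replicating the outermost pixels into the left/right extensions),
; then the bottom rows and finally the top rows are filled by replicating
; the nearest copied row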
cglobal emu_edge_16bpc, 10, 13, 1, bw, bh, iw, ih, x, \
                             y, dst, dstride, src, sstride, \
                             bottomext, rightext, blk
    ; we assume that the buffer (stride) is larger than the width, so we can
    ; safely overwrite a few bytes past the end of each row

%if ARCH_X86_64
 %define reg_zero       r12q
 %define reg_tmp        r10
 %define reg_src        srcq
 %define reg_bottomext  bottomextq
 %define reg_rightext   rightextq
 %define reg_blkm       r9m
%else
 %define reg_zero       r6
 %define reg_tmp        r0
 %define reg_src        r1
 %define reg_bottomext  r0
 %define reg_rightext   r1
 %define reg_blkm       r2m
%endif
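; on x86-32 there are not enough registers to keep everything live, so
; temporaries alias the argument registers and values are reloaded from
; their stack slots (r0m, r1m, ...) wherever they were clobbered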
    ;
    ; ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
    xor            reg_zero, reg_zero
    lea             reg_tmp, [ihq-1]
    cmp                  yq, ihq
    cmovs           reg_tmp, yq
    test                 yq, yq
    cmovs           reg_tmp, reg_zero
%if ARCH_X86_64
    imul            reg_tmp, sstrideq
    add                srcq, reg_tmp
%else
    imul            reg_tmp, sstridem
    mov             reg_src, srcm
    add             reg_src, reg_tmp
%endif
    ;
    ; ref += iclip(x, 0, iw - 1)
    lea             reg_tmp, [iwq-1]
    cmp                  xq, iwq
    cmovs           reg_tmp, xq
    test                 xq, xq
    cmovs           reg_tmp, reg_zero
    lea             reg_src, [reg_src+reg_tmp*2]
%if ARCH_X86_32
    mov                srcm, reg_src
%endif
    ;
    ; bottom_ext = iclip(y + bh - ih, 0, bh - 1)
%if ARCH_X86_32
    mov                  r1, r1m ; restore bh
%endif
    lea       reg_bottomext, [yq+bhq]
    sub       reg_bottomext, ihq
    lea                  r3, [bhq-1]
    cmovs     reg_bottomext, reg_zero
    ;

    DEFINE_ARGS bw, bh, iw, ih, x, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; top_ext = iclip(-y, 0, bh - 1)
    neg             topextq
    cmovs           topextq, reg_zero
    cmp       reg_bottomext, bhq
    cmovns    reg_bottomext, r3
    cmp             topextq, bhq
    cmovg           topextq, r3
 %if ARCH_X86_32
    mov                 r4m, reg_bottomext
    ;
    ; right_ext = iclip(x + bw - iw, 0, bw - 1)
    mov                  r0, r0m ; restore bw
 %endif
    lea        reg_rightext, [xq+bwq]
    sub        reg_rightext, iwq
    lea                  r2, [bwq-1]
    cmovs      reg_rightext, reg_zero

    DEFINE_ARGS bw, bh, iw, ih, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; left_ext = iclip(-x, 0, bw - 1)
    neg            leftextq
    cmovs          leftextq, reg_zero
    cmp        reg_rightext, bwq
    cmovns     reg_rightext, r2
 %if ARCH_X86_32
    mov                 r3m, r1
 %endif
    cmp            leftextq, bwq
    cmovns         leftextq, r2

%undef reg_zero
%undef reg_tmp
%undef reg_src
%undef reg_bottomext
%undef reg_rightext

    DEFINE_ARGS bw, centerh, centerw, dummy, leftext, \
                topext, dst, dstride, src, sstride, \
                bottomext, rightext, blk

    ; center_h = bh - top_ext - bottom_ext
%if ARCH_X86_64
    lea                  r3, [bottomextq+topextq]
    sub            centerhq, r3
%else
    mov                  r1, centerhm ; restore r1
    sub            centerhq, topextq
    sub            centerhq, r4m
    mov                 r1m, centerhq
%endif
    ;
    ; blk += top_ext * PXSTRIDE(dst_stride)
    mov                  r2, topextq
%if ARCH_X86_64
    imul                 r2, dstrideq
%else
    mov                  r6, r6m ; restore dstq
    imul                 r2, dstridem
%endif
    add                dstq, r2
    mov            reg_blkm, dstq ; save pointer for ext
    ;
    ; center_w = bw - left_ext - right_ext
    mov            centerwq, bwq
%if ARCH_X86_64
    lea                  r3, [rightextq+leftextq]
    sub            centerwq, r3
%else
    sub            centerwq, r3m
    sub            centerwq, leftextq
%endif

; v_loop macro
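; each iteration fills one row: the left extension (if %1 is set), the
; copied source pixels, and the right extension (if %2 is set), then
; advances dst/src by one stride; %3 is a label suffix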
%macro v_loop 3 ; need_left_ext, need_right_ext, suffix
  %if ARCH_X86_64
    %define reg_tmp        r12
  %else
    %define reg_tmp        r0
  %endif
.v_loop_%3:
  %if ARCH_X86_32
    mov                  r0, r0m
    mov                  r1, r1m
  %endif
%if %1
    ; left extension
  %if ARCH_X86_64
    movd                 m0, [srcq]
  %else
    mov                  r3, srcm
    movd                 m0, [r3]
  %endif
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
    xor                  r3, r3
.left_loop_%3:
    mova        [dstq+r3*2], m0
    add                  r3, mmsize/2
    cmp                  r3, leftextq
    jl .left_loop_%3
    ; body
    lea             reg_tmp, [dstq+leftextq*2]
%endif
    xor                  r3, r3
.body_loop_%3:
  %if ARCH_X86_64
    movu                 m0, [srcq+r3*2]
  %else
    mov                  r1, srcm
    movu                 m0, [r1+r3*2]
  %endif
%if %1
    movu     [reg_tmp+r3*2], m0
%else
    movu        [dstq+r3*2], m0
%endif
    add                  r3, mmsize/2
    cmp                  r3, centerwq
    jl .body_loop_%3
%if %2
    ; right extension
%if %1
    lea             reg_tmp, [reg_tmp+centerwq*2]
%else
    lea             reg_tmp, [dstq+centerwq*2]
%endif
  %if ARCH_X86_64
    movd                 m0, [srcq+centerwq*2-2]
  %else
    mov                  r3, srcm
    movd                 m0, [r3+centerwq*2-2]
  %endif
    pshuflw              m0, m0, q0000
    punpcklqdq           m0, m0
    xor                  r3, r3
.right_loop_%3:
    movu     [reg_tmp+r3*2], m0
    add                  r3, mmsize/2
  %if ARCH_X86_64
    cmp                  r3, rightextq
  %else
    cmp                  r3, r3m
  %endif
    jl .right_loop_%3
%endif
  %if ARCH_X86_64
    add                dstq, dstrideq
    add                srcq, sstrideq
    dec            centerhq
    jg .v_loop_%3
  %else
    add                dstq, dstridem
    mov                  r0, sstridem
    add                srcm, r0
    sub       dword centerhm, 1
    jg .v_loop_%3
    mov                  r0, r0m ; restore r0
  %endif
%endmacro ; v_loop

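    ; dispatch to the variant matching the required left/right extensions so
    ; the row loop itself contains no per-row branches on those flags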
    test           leftextq, leftextq
    jnz .need_left_ext
 %if ARCH_X86_64
    test          rightextq, rightextq
    jnz .need_right_ext
 %else
    cmp            leftextq, r3m ; leftextq == 0, so this tests rightext (r3m)
    jne .need_right_ext
 %endif
    v_loop                0, 0, 0
    jmp .body_done

    ; left/right extension variants
.need_left_ext:
 %if ARCH_X86_64
    test          rightextq, rightextq
 %else
    mov                  r3, r3m
    test                 r3, r3
 %endif
    jnz .need_left_right_ext
    v_loop                1, 0, 1
    jmp .body_done

.need_left_right_ext:
    v_loop                1, 1, 2
    jmp .body_done

.need_right_ext:
    v_loop                0, 1, 3

.body_done:
; register map at this point:
; r0 ; bw
; r1 ; x loop counter
; r4 ; y loop counter
; r5 ; topextq
; r6 ; dstq
; r7 ; dstrideq
; r8 ; srcq
%if ARCH_X86_64
 %define reg_dstride    dstrideq
%else
 %define reg_dstride    r2
%endif
    ;
    ; bottom edge extension
 %if ARCH_X86_64
    test         bottomextq, bottomextq
    jz .top
 %else
    xor                  r1, r1
    cmp                  r1, r4m
    je .top
 %endif
    ;
 %if ARCH_X86_64
    mov                srcq, dstq
    sub                srcq, dstrideq
    xor                  r1, r1
 %else
    mov                  r3, dstq
    mov         reg_dstride, dstridem
    sub                  r3, reg_dstride
    mov                srcm, r3
 %endif
    ;
.bottom_x_loop:
 %if ARCH_X86_64
    mova                 m0, [srcq+r1*2]
    lea                  r3, [dstq+r1*2]
    mov                  r4, bottomextq
 %else
    mov                  r3, srcm
    mova                 m0, [r3+r1*2]
    lea                  r3, [dstq+r1*2]
    mov                  r4, r4m
 %endif
    ;
.bottom_y_loop:
    mova               [r3], m0
    add                  r3, reg_dstride
    dec                  r4
    jg .bottom_y_loop
    add                  r1, mmsize/2
    cmp                  r1, bwq
    jl .bottom_x_loop

.top:
    ; top edge extension
    test            topextq, topextq
    jz .end
%if ARCH_X86_64
    mov                srcq, reg_blkm
%else
    mov                  r3, reg_blkm
    mov         reg_dstride, dstridem
%endif
    mov                dstq, dstm
    xor                  r1, r1
    ;
.top_x_loop:
%if ARCH_X86_64
    mova                 m0, [srcq+r1*2]
%else
    mov                  r3, reg_blkm
    mova                 m0, [r3+r1*2]
%endif
    lea                  r3, [dstq+r1*2]
    mov                  r4, topextq
    ;
.top_y_loop:
    mova               [r3], m0
    add                  r3, reg_dstride
    dec                  r4
    jg .top_y_loop
    add                  r1, mmsize/2
    cmp                  r1, bwq
    jl .top_x_loop

.end:
    RET

%undef reg_dstride
%undef reg_blkm
%undef reg_tmp

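; SCRATCH frees up a register: on x86-64 it simply renames m%1 to m%2, while
; on x86-32 (only 8 xmm registers) it spills m%1 to stack slot %3 and
; redefines m%2 as that memory operand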
%macro SCRATCH 3
%if ARCH_X86_32
    mova [rsp+%3*mmsize], m%1
%define m%2 [rsp+%3*mmsize]
%else
    SWAP             %1, %2
%endif
%endmacro

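; resize scales one row at a time. The source position for output pixel x is
; tracked in 14-bit fixed point as mx = mx0 + x*dx, with src_x = mx >> 14 and
; (mx >> 8) & 63 selecting one of 64 8-tap filters; mx0 and src_w are
; pre-biased below (by 4<<14 and 8 respectively) to account for the filter
; window and the edge clamp.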
%if ARCH_X86_64
cglobal resize_16bpc, 0, 12, 16, 1*16, dst, dst_stride, src, src_stride, \
                                       dst_w, h, src_w, dx, mx0, pxmax
%elif STACK_ALIGNMENT >= 16
cglobal resize_16bpc, 0, 7, 8, 6*16, dst, dst_stride, src, src_stride, \
                                     dst_w, h, src_w, dx, mx0, pxmax
%else
cglobal resize_16bpc, 0, 6, 8, 6*16, dst, dst_stride, src, src_stride, \
                                     dst_w, h, src_w, dx, mx0, pxmax
%endif
    movifnidn         dstq, dstmp
    movifnidn         srcq, srcmp
%if STACK_ALIGNMENT >= 16
    movifnidn       dst_wd, dst_wm
%endif
%if ARCH_X86_64
    movifnidn           hd, hm
%endif
    sub         dword mx0m, 4<<14
    sub       dword src_wm, 8
    movd                m4, pxmaxm
    movd                m7, dxm
    movd                m6, mx0m
    movd                m5, src_wm
    punpcklwd           m4, m4
    pshufd              m4, m4, q0000
    pshufd              m7, m7, q0000
    pshufd              m6, m6, q0000
    pshufd              m5, m5, q0000
    mova [rsp+16*3*ARCH_X86_32], m4
%if ARCH_X86_64
 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x
    LEA                 r7, $$
 %define base r7-$$
%else
 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, x
 %define hd dword r5m
 %if STACK_ALIGNMENT >= 16
    LEA                 r6, $$
  %define base r6-$$
 %else
    LEA                 r4, $$
  %define base r4-$$
 %endif
%endif
%if ARCH_X86_64
    mova               m12, [base+pd_64]
    mova               m11, [base+pd_63]
%else
 %define m12 [base+pd_64]
 %define m11 [base+pd_63]
%endif
    pmaddwd             m4, m7, [base+rescale_mul] ; dx*[0,1,2,3]
    pslld               m7, 2                      ; dx*4
    pslld               m5, 14
    paddd               m6, m4                     ; mx+[0..3]*dx
    SCRATCH              7, 15, 0
    SCRATCH              6, 14, 1
    SCRATCH              5, 13, 2
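    ; m15 = dx*4 (x step per 4-pixel iteration), m14 = mx for lanes 0..3,
    ; m13 = (src_w-8) << 14 (upper clamp for mx)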
    pxor                m1, m1
.loop_y:
    xor                 xd, xd
    mova                m0, m14            ; per-line working version of mx
.loop_x:
    pcmpgtd             m1, m0
    pandn               m1, m0
    psrad               m2, m0, 8          ; filter offset (unmasked)
    pcmpgtd             m3, m13, m1
    pand                m1, m3
    pandn               m3, m13
    por                 m1, m3
    psubd               m3, m0, m1         ; pshufb offset
    psrad               m1, 14             ; clipped src_x offset
    psrad               m3, 14             ; pshufb edge_emu offset
    pand                m2, m11            ; filter offset (masked)
    ; load source pixels
%if ARCH_X86_64
    movd               r8d, m1
    pshuflw             m1, m1, q3232
    movd               r9d, m1
    punpckhqdq          m1, m1
    movd              r10d, m1
    psrlq               m1, 32
    movd              r11d, m1
    movu                m4, [srcq+r8*2]
    movu                m5, [srcq+r9*2]
    movu                m6, [srcq+r10*2]
    movu                m7, [srcq+r11*2]
    ; if no emulation is required, we don't need to shuffle or emulate edges
    packssdw            m3, m3
    movq               r11, m3
    test               r11, r11
    jz .filter
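    ; at least one lane was clamped: build pshufb masks from resize_shuf,
    ; offset by the clamp distance, to replicate the edge pixel across the
    ; out-of-range taps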
    movsx               r8, r11w
    sar                r11, 16
    movsx               r9, r11w
    sar                r11, 16
    movsx              r10, r11w
    sar                r11, 16
    movu                m1, [base+resize_shuf+8+r8*2]
    movu                m3, [base+resize_shuf+8+r9*2]
    movu                m8, [base+resize_shuf+8+r10*2]
    movu                m9, [base+resize_shuf+8+r11*2]
    pshufb              m4, m1
    pshufb              m5, m3
    pshufb              m6, m8
    pshufb              m7, m9
.filter:
    movd               r8d, m2
    pshuflw             m2, m2, q3232
    movd               r9d, m2
    punpckhqdq          m2, m2
    movd              r10d, m2
    psrlq               m2, 32
    movd              r11d, m2
    movq                m8, [base+resize_filter+r8*8]
    movq                m2, [base+resize_filter+r9*8]
    pxor                m9, m9
    punpcklbw           m1, m9, m8
    punpcklbw           m3, m9, m2
    psraw               m1, 8
    psraw               m3, 8
    movq               m10, [base+resize_filter+r10*8]
    movq                m2, [base+resize_filter+r11*8]
    punpcklbw           m8, m9, m10
    punpcklbw           m9, m2
    psraw               m8, 8
    psraw               m9, 8
    pmaddwd             m4, m1
    pmaddwd             m5, m3
    pmaddwd             m6, m8
    pmaddwd             m7, m9
    phaddd              m4, m5
%else
    movd                r3, m1
    pshuflw             m1, m1, q3232
    movd                r1, m1
    punpckhqdq          m1, m1
    movu                m4, [srcq+r3*2]
    movu                m5, [srcq+r1*2]
    movd                r3, m1
    psrlq               m1, 32
    movd                r1, m1
    movu                m6, [srcq+r3*2]
    movu                m7, [srcq+r1*2]
    ; if no emulation is required, we don't need to shuffle or emulate edges
    pxor                m1, m1
    pcmpeqb             m1, m3
    pmovmskb           r3d, m1
    cmp                r3d, 0xffff
    je .filter
    movd                r3, m3
    movu                m1, [base+resize_shuf+8+r3*2]
    pshuflw             m3, m3, q3232
    movd                r1, m3
    pshufb              m4, m1
    movu                m1, [base+resize_shuf+8+r1*2]
    punpckhqdq          m3, m3
    movd                r3, m3
    pshufb              m5, m1
    movu                m1, [base+resize_shuf+8+r3*2]
    psrlq               m3, 32
    movd                r1, m3
    pshufb              m6, m1
    movu                m1, [base+resize_shuf+8+r1*2]
    pshufb              m7, m1
.filter:
    mova        [esp+4*16], m6
    mova        [esp+5*16], m7
    movd                r3, m2
    pshuflw             m2, m2, q3232
    movd                r1, m2
    movq                m6, [base+resize_filter+r3*8]
    movq                m7, [base+resize_filter+r1*8]
    pxor                m3, m3
    punpcklbw           m1, m3, m6
    punpcklbw           m3, m7
    psraw               m1, 8
    psraw               m3, 8
    pmaddwd             m4, m1
    pmaddwd             m5, m3
    punpckhqdq          m2, m2
    movd                r3, m2
    psrlq               m2, 32
    movd                r1, m2
    phaddd              m4, m5
    movq                m2, [base+resize_filter+r3*8]
    movq                m5, [base+resize_filter+r1*8]
    mova                m6, [esp+4*16]
    mova                m7, [esp+5*16]
    pxor                m3, m3
    punpcklbw           m1, m3, m2
    punpcklbw           m3, m5
    psraw               m1, 8
    psraw               m3, 8
    pmaddwd             m6, m1
    pmaddwd             m7, m3
%endif
    phaddd              m6, m7
    phaddd              m4, m6
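    ; final value: (64 - sum) >> 7, clamped to [0, pxmax]; the subtraction
    ; works because the resize_filter taps appear to be stored negated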
    pxor                m1, m1
    psubd               m2, m12, m4
    psrad               m2, 7
    packssdw            m2, m2
    pmaxsw              m2, m1
    pminsw              m2, [rsp+16*3*ARCH_X86_32]
    movq       [dstq+xq*2], m2
    paddd               m0, m15
    add                 xd, 4
%if STACK_ALIGNMENT >= 16
    cmp                 xd, dst_wd
%else
    cmp                 xd, dst_wm
%endif
    jl .loop_x
    add               dstq, dst_stridemp
    add               srcq, src_stridemp
    dec                 hd
    jg .loop_y
    RET
