• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; Copyright © 2020, VideoLAN and dav1d authors
2; Copyright © 2020, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
SECTION_RODATA 64

; Byte-permute indices (presumably for vpermb/pshufb on zmm — TODO confirm)
; used by horizontal subpel filtering of 16-bit pixels. Each dword selects
; two adjacent words and advances one pixel per dword (0,1,2,3 = words 0-1;
; 2,3,4,5 = words 1-2; ...); rows with a +32 base apply the same pattern to
; the upper 32 bytes of the source.
; NOTE(review): spel_h_shufC starts 32 bytes after spel_h_shufA, and
; spel_h_shufD 32 bytes after spel_h_shufB — the tables intentionally
; overlap so one 64-byte region serves both labels. Do not reorder.
spel_h_shufA:  db  0,  1,  2,  3,  2,  3,  4,  5,  4,  5,  6,  7,  6,  7,  8,  9
               db 32, 33, 34, 35, 34, 35, 36, 37, 36, 37, 38, 39, 38, 39, 40, 41
spel_h_shufC:  db  8,  9, 10, 11, 10, 11, 12, 13, 12, 13, 14, 15, 14, 15, 16, 17
               db 40, 41, 42, 43, 42, 43, 44, 45, 44, 45, 46, 47, 46, 47, 48, 49
               db 16, 17, 18, 19, 18, 19, 20, 21, 20, 21, 22, 23, 22, 23, 24, 25
               db 48, 49, 50, 51, 50, 51, 52, 53, 52, 53, 54, 55, 54, 55, 56, 57
spel_h_shufB:  db  4,  5,  6,  7,  6,  7,  8,  9,  8,  9, 10, 11, 10, 11, 12, 13
               db 36, 37, 38, 39, 38, 39, 40, 41, 40, 41, 42, 43, 42, 43, 44, 45
spel_h_shufD:  db 12, 13, 14, 15, 14, 15, 16, 17, 16, 17, 18, 19, 18, 19, 20, 21
               db 44, 45, 46, 47, 46, 47, 48, 49, 48, 49, 50, 51, 50, 51, 52, 53
               db 20, 21, 22, 23, 22, 23, 24, 25, 24, 25, 26, 27, 26, 27, 28, 29
               db 52, 53, 54, 55, 54, 55, 56, 57, 56, 57, 58, 59, 58, 59, 60, 61
; Word-interleave patterns for vertical filtering: indices < 32 and >= 32
; pick words from the two 32-byte halves (or two sources of a two-source
; permute — NOTE(review): confirm against the .v code paths).
spel_v_shuf8:  db  0,  1, 16, 17,  2,  3, 18, 19,  4,  5, 20, 21,  6,  7, 22, 23
               db 16, 17, 32, 33, 18, 19, 34, 35, 20, 21, 36, 37, 22, 23, 38, 39
               db  8,  9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31
               db 24, 25, 40, 41, 26, 27, 42, 43, 28, 29, 44, 45, 30, 31, 46, 47
spel_v_shuf16: db  0,  1, 32, 33,  2,  3, 34, 35,  4,  5, 36, 37,  6,  7, 38, 39
               db  8,  9, 40, 41, 10, 11, 42, 43, 12, 13, 44, 45, 14, 15, 46, 47
               db 16, 17, 48, 49, 18, 19, 50, 51, 20, 21, 52, 53, 22, 23, 54, 55
               db 24, 25, 56, 57, 26, 27, 58, 59, 28, 29, 60, 61, 30, 31, 62, 63
; Final byte-permutes for the prep paths: each pair (1,2), (5,6), ... takes
; bytes 1-2 of every dword, i.e. narrows each 32-bit intermediate by an
; 8-bit right shift while packing to words. B and C additionally swap
; 16/32-byte groups between the two sources (indices >= 64 address the
; second source of a two-source permute — NOTE(review): confirm vs users).
prep_endA:     db  1,  2,  5,  6,  9, 10, 13, 14, 17, 18, 21, 22, 25, 26, 29, 30
               db 33, 34, 37, 38, 41, 42, 45, 46, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 81, 82, 85, 86, 89, 90, 93, 94
               db 97, 98,101,102,105,106,109,110,113,114,117,118,121,122,125,126
prep_endB:     db  1,  2,  5,  6,  9, 10, 13, 14, 33, 34, 37, 38, 41, 42, 45, 46
               db 17, 18, 21, 22, 25, 26, 29, 30, 49, 50, 53, 54, 57, 58, 61, 62
               db 65, 66, 69, 70, 73, 74, 77, 78, 97, 98,101,102,105,106,109,110
               db 81, 82, 85, 86, 89, 90, 93, 94,113,114,117,118,121,122,125,126
prep_endC:     db  1,  2,  5,  6,  9, 10, 13, 14, 65, 66, 69, 70, 73, 74, 77, 78
               db 17, 18, 21, 22, 25, 26, 29, 30, 81, 82, 85, 86, 89, 90, 93, 94
               db 33, 34, 37, 38, 41, 42, 45, 46, 97, 98,101,102,105,106,109,110
               db 49, 50, 53, 54, 57, 58, 61, 62,113,114,117,118,121,122,125,126
; Two-source byte-permutes for the subpel HV paths, per block width.
; The recurring (1,2) byte pairs again extract bytes 1-2 of each dword
; (>>8 narrowing) while interleaving words from the two sources
; (indices >= 64 select from the second source — NOTE(review): confirm
; against the 8-tap hv code, which is outside this chunk).
spel_shuf4a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
               db 17, 18, 33, 34, 21, 22, 37, 38, 25, 26, 41, 42, 29, 30, 45, 46
               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
               db 49, 50, 65, 66, 53, 54, 69, 70, 57, 58, 73, 74, 61, 62, 77, 78
spel_shuf4b:   db 50, 51, 65, 66, 54, 55, 69, 70, 58, 59, 73, 74, 62, 63, 77, 78
               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
               db 81, 82, 97, 98, 85, 86,101,102, 89, 90,105,106, 93, 94,109,110
               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf8a:   db  1,  2, 17, 18,  5,  6, 21, 22,  9, 10, 25, 26, 13, 14, 29, 30
               db 17, 18, 65, 66, 21, 22, 69, 70, 25, 26, 73, 74, 29, 30, 77, 78
               db 33, 34, 49, 50, 37, 38, 53, 54, 41, 42, 57, 58, 45, 46, 61, 62
               db 49, 50, 97, 98, 53, 54,101,102, 57, 58,105,106, 61, 62,109,110
spel_shuf8b:   db 18, 19, 65, 66, 22, 23, 69, 70, 26, 27, 73, 74, 30, 31, 77, 78
               db 65, 66, 81, 82, 69, 70, 85, 86, 73, 74, 89, 90, 77, 78, 93, 94
               db 50, 51, 97, 98, 54, 55,101,102, 58, 59,105,106, 62, 63,109,110
               db 97, 98,113,114,101,102,117,118,105,106,121,122,109,110,125,126
spel_shuf16:   db  1,  2, 33, 34,  5,  6, 37, 38,  9, 10, 41, 42, 13, 14, 45, 46
               db 17, 18, 49, 50, 21, 22, 53, 54, 25, 26, 57, 58, 29, 30, 61, 62
               db 65, 66, 97, 98, 69, 70,101,102, 73, 74,105,106, 77, 78,109,110
               db 81, 82,113,114, 85, 86,117,118, 89, 90,121,122, 93, 94,125,126
spel_shuf32:   db  1,  2, 65, 66,  5,  6, 69, 70,  9, 10, 73, 74, 13, 14, 77, 78
               db 17, 18, 81, 82, 21, 22, 85, 86, 25, 26, 89, 90, 29, 30, 93, 94
               db 33, 34, 97, 98, 37, 38,101,102, 41, 42,105,106, 45, 46,109,110
               db 49, 50,113,114, 53, 54,117,118, 57, 58,121,122, 61, 62,125,126
; w=2 variants; note spel_h_shuf2b is only 48 bytes and spel_shuf2 only
; 16 — do not move spel_h_shuf2a, which directly follows spel_shuf2.
spel_h_shuf2b: db  1,  2, 17, 18,  5,  6, 21, 22, 17, 18, 33, 34, 21, 22, 37, 38
               db 33, 34, 49, 50, 37, 38, 53, 54, 49, 50,  9, 10, 53, 54, 13, 14
               db  9, 10, 25, 26, 13, 14, 29, 30, 25, 26, 41, 42, 29, 30, 45, 46
spel_shuf2:    db 10, 11, 17, 18, 14, 15, 21, 22, 17, 18, 25, 26, 21, 22, 29, 30
spel_h_shuf2a: db  0,  1,  2,  3,  2,  3,  4,  5, 16, 17, 18, 19, 18, 19, 20, 21
               db  4,  5,  6,  7,  6,  7,  8,  9, 20, 21, 22, 23, 22, 23, 24, 25
; w_mask output permutes:
;  - end42x: every 4th byte starting at 1 (packs one mask byte per dword
;    pair, for the 420/422 subsampled mask layouts)
;  - end444: every even byte (plain word -> byte pack)
w_mask_end42x: db  1,  5,  9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61
               db 65, 69, 73, 77, 81, 85, 89, 93, 97,101,105,109,113,117,121,125
w_mask_end444: db  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
               db 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62
               db 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94
               db 96, 98,100,102,104,106,108,110,112,114,116,118,120,122,124,126
; Even-byte (word) gathers that regroup dwords per block width before the
; mask reduction; indices >= 64 address a second source register.
w_mask_shuf4:  db  0,  2,  8, 10,  4,  6, 12, 14, 16, 18, 24, 26, 20, 22, 28, 30
               db 32, 34, 40, 42, 36, 38, 44, 46, 48, 50, 56, 58, 52, 54, 60, 62
               db 64, 66, 72, 74, 68, 70, 76, 78, 80, 82, 88, 90, 84, 86, 92, 94
               db 96, 98,104,106,100,102,108,110,112,114,120,122,116,118,124,126
w_mask_shuf8:  db  0,  2, 16, 18,  4,  6, 20, 22,  8, 10, 24, 26, 12, 14, 28, 30
               db 32, 34, 48, 50, 36, 38, 52, 54, 40, 42, 56, 58, 44, 46, 60, 62
               db 64, 66, 80, 82, 68, 70, 84, 86, 72, 74, 88, 90, 76, 78, 92, 94
               db 96, 98,112,114,100,102,116,118,104,106,120,122,108,110,124,126
w_mask_shuf16: db  0,  2, 32, 34,  4,  6, 36, 38,  8, 10, 40, 42, 12, 14, 44, 46
               db 16, 18, 48, 50, 20, 22, 52, 54, 24, 26, 56, 58, 28, 30, 60, 62
               db 64, 66, 96, 98, 68, 70,100,102, 72, 74,104,106, 76, 78,108,110
               db 80, 82,112,114, 84, 86,116,118, 88, 90,120,122, 92, 94,124,126
; Warp 8x8 source permutes: each 8-byte group pairs 4 bytes from the first
; source with 4 from the second (+32), sliding by 2 bytes per group.
; permB continues where permA's last row starts (its first row repeats
; permA's final row), covering the lower/upper halves of the 8x8 block.
warp8x8_permA: db  0,  1,  2,  3, 32, 33, 34, 35,  2,  3,  4,  5, 34, 35, 36, 37
               db  4,  5,  6,  7, 36, 37, 38, 39,  6,  7,  8,  9, 38, 39, 40, 41
               db  8,  9, 10, 11, 40, 41, 42, 43, 10, 11, 12, 13, 42, 43, 44, 45
               db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
warp8x8_permB: db 12, 13, 14, 15, 44, 45, 46, 47, 14, 15, 16, 17, 46, 47, 48, 49
               db 16, 17, 18, 19, 48, 49, 50, 51, 18, 19, 20, 21, 50, 51, 52, 53
               db 20, 21, 22, 23, 52, 53, 54, 55, 22, 23, 24, 25, 54, 55, 56, 57
               db 24, 25, 26, 27, 56, 57, 58, 59, 26, 27, 28, 29, 58, 59, 60, 61
; Final word-regrouping of the warp results before store.
warp8x8_end:   db  0,  1,  4,  5, 16, 17, 20, 21, 32, 33, 36, 37, 48, 49, 52, 53
               db  2,  3,  6,  7, 18, 19, 22, 23, 34, 35, 38, 39, 50, 51, 54, 55
               db  8,  9, 12, 13, 24, 25, 28, 29, 40, 41, 44, 45, 56, 57, 60, 61
               db 10, 11, 14, 15, 26, 27, 30, 31, 42, 43, 46, 47, 58, 59, 62, 63
; deint_q_shuf deliberately overlays the data below instead of storing its
; own bytes: vpermq only reads the low bits of each qword index, and the
; qwords formed by pd_0to7 / dd 1 / pw_2048 / dd 3 / pw_8192 / avg_shift /
; pw_27615+pw_32766 evaluate (mod 8) to exactly the commented-out sequence
; 0,2,4,6,1,3,5,7. The interleaved "dd 1" / "dd 3" lines exist solely to
; complete this overlay — do not reorder anything in this group.
deint_q_shuf: ;dq  0,  2,  4,  6,  1,  3,  5,  7
pd_0to7:       dd  0,  1,  2,  3,  4,  5,  6,  7
               dd  1
pw_2048:       times 2 dw 2048
               dd  3
pw_8192:       times 2 dw 8192
avg_shift:     dw  5,  5,  3,  3
pw_27615:      times 2 dw 27615
pw_32766:      times 2 dw 32766
; Byte-zip patterns (-1 = zero byte) widening words to dwords.
warp8x8_permC: db -1,  0, -1,  1, -1,  8, -1,  9, -1,  4, -1,  5, -1, 12, -1, 13
warp8x8_permD: db -1,  2, -1,  3, -1, 10, -1, 11, -1,  6, -1,  7, -1, 14, -1, 15
warp_shift_h:  db 11, 19, 11, 19, 43, 51, 43, 51, 13, 21, 13, 21, 45, 53, 45, 53
blend_shuf:    db  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  2,  3,  2,  3,  2,  3
; resize permutes/shuffles; shufA/B widen words to dwords with zeroed
; high bytes (-1), resize_shuf clamps by replicating edge pixels.
resize_permA:  dd  0,  4,  8, 12,  1,  5,  9, 13, 16, 20, 24, 28, 17, 21, 25, 29
resize_permB:  dd  2,  6, 10, 14,  3,  7, 11, 15, 18, 22, 26, 30, 19, 23, 27, 31
resize_permC:  dq  0,  1,  4,  5,  8,  9, 12, 13
resize_permD:  dq  2,  3,  6,  7, 10, 11, 14, 15
resize_permE:  dq  0,  2,  4,  6
resize_shufA:  db -1,  0, -1,  1, -1,  4, -1,  5, -1,  8, -1,  9, -1, 12, -1, 13
resize_shufB:  db -1,  2, -1,  3, -1,  6, -1,  7, -1, 10, -1, 11, -1, 14, -1, 15
rescale_mul:   dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
resize_shuf:   db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  2,  3,  4,  5,  6,  7
               db  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15
148
; Per-bitdepth shift/rounding constants. Several tables hold two entries
; (10-bit, 12-bit) selected at runtime by bitdepth_max (see the r8m >> 11
; indexing in the code below). Expressions like "128 - (8192 << 8)" fold
; the intermediate-offset (PREP_BIAS-style) subtraction into the rounder.
prep_hv_shift:    dq  6,  4
put_bilin_h_rnd:  dw  8,  8, 10, 10
prep_mul:         dw 16, 16,  4,  4
put_8tap_h_rnd:   dd 34, 40
prep_8tap_rnd:    dd 128 - (8192 << 8)
warp_8x8_rnd_h:   dd 512, 2048
warp_8x8_rnd_v:   dd 262144, 65536
warp_8x8t_rnd_v:  dd 16384 - (8192 << 15)
avg_round:        dw -16400, -16400, -16388, -16388
w_avg_round:      dd 128 + (8192 << 4),  32 + (8192 << 4)
mask_round:       dd 512 + (8192 << 6), 128 + (8192 << 6)
w_mask_round:     dd 128, 64
bidir_shift:      dw  6,  6,  4,  4
162
; Small broadcast constants (naming: pb/pw/pd = packed byte/word/dword,
; value in the suffix; "m" = negative).
pb_64:    times 4 db 64
pw_m512:  times 2 dw -512
pw_2:     times 2 dw 2
pw_64:    times 2 dw 64
pd_32:    dd 32
pd_63:    dd 63
pd_128:   dd 128
pd_640:   dd 640
pd_2176:  dd 2176
pd_16384: dd 16384
pd_0_4:   dd 0, 4

; Aliases reusing existing storage: prep_mul starts with dw 16,16 and
; warp_8x8_rnd_h with dd 512, so no separate constants are needed.
%define pw_16 prep_mul
%define pd_512 warp_8x8_rnd_h
177
; BASE_JMP_TABLE name, suffix, width...
; Emits a table of 16-bit offsets, one per listed width, of the
; <name>_<suffix>_w<width> entry points relative to the <name>_<suffix>
; base label (which is %xdefine'd elsewhere to a label inside the
; function, e.g. put_avx512icl -> ...put_bilin....put). Also defines
; <name>_<suffix>_table biased by -%3 (the first width) so the caller's
; tzcnt-of-width index lands on entry 0.
%macro BASE_JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - %3)
    %xdefine %%base %1_%2
    %%table:
    %rep %0 - 2                       ; one dw entry per width argument
        dw %%base %+ _w%3 - %%base
        %rotate 1
    %endrep
%endmacro
187
; HV_JMP_TABLE type, filter, suffix, type_mask, width...
; Emits up to three per-width 16-bit offset tables for a filter function's
; .h_w*, .v_w* and .hv_w* local labels, relative to the <type>_<suffix>
; base label. %4 is a bitmask of which tables to emit: 1 = h, 2 = v,
; 4 = hv. Each table symbol <type>_<filter>_{h,v,hv}_<suffix>_table is
; biased by -%5 (the first width) for tzcnt-based indexing. The %rotate 4
; after each emitted table realigns %5 back to the first width for the
; next one (the %rep loop rotated through all %0-4 widths).
%macro HV_JMP_TABLE 5-*
    %xdefine %%prefix mangle(private_prefix %+ _%1_%2_16bpc_%3)
    %xdefine %%base %1_%3
    %assign %%types %4
    %if %%types & 1
        %xdefine %1_%2_h_%3_table  (%%h  - %5)
        %%h:
        %rep %0 - 4
            dw %%prefix %+ .h_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 2
        %xdefine %1_%2_v_%3_table  (%%v  - %5)
        %%v:
        %rep %0 - 4
            dw %%prefix %+ .v_w%5 - %%base
            %rotate 1
        %endrep
        %rotate 4
    %endif
    %if %%types & 4
        %xdefine %1_%2_hv_%3_table (%%hv - %5)
        %%hv:
        %rep %0 - 4
            dw %%prefix %+ .hv_w%5 - %%base
            %rotate 1
        %endrep
    %endif
%endmacro
219
; BIDIR_JMP_TABLE name, suffix, width...
; Emits 32-bit entries for a bidirectional function's .w<width> labels.
; Unlike the dw tables above, the entries are relative to the table
; symbol itself (self-relative), and the symbol is biased by -2*%3
; (dd entries are scaled by 4 at the use site, hence the different bias
; — NOTE(review): confirm against a caller, which is outside this chunk).
%macro BIDIR_JMP_TABLE 2-*
    %xdefine %1_%2_table (%%table - 2*%3)
    %xdefine %%base %1_%2_table
    %xdefine %%prefix mangle(private_prefix %+ _%1_16bpc_%2)
    %%table:
    %rep %0 - 2                       ; one dd entry per width argument
        dd %%prefix %+ .w%3 - %%base
        %rotate 1
    %endrep
%endmacro
230
; Base labels for the jump tables: the .put/.prep local labels inside the
; bilin entry points (put_bilin is defined below; prep_bilin is outside
; this chunk).
%xdefine put_avx512icl mangle(private_prefix %+ _put_bilin_16bpc_avx512icl.put)
%xdefine prep_avx512icl mangle(private_prefix %+ _prep_bilin_16bpc_avx512icl.prep)

; Width lists per function; put/blend_v/blend_h additionally support w=2.
; HV type mask: 7 = h+v+hv tables for bilin, 2 = v-only for 6tap/8tap.
BIDIR_JMP_TABLE avg,        avx512icl,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_avg,      avx512icl,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE mask,       avx512icl,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_420, avx512icl,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_422, avx512icl,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE w_mask_444, avx512icl,       4, 8, 16, 32, 64, 128
BIDIR_JMP_TABLE blend,      avx512icl,       4, 8, 16, 32
BIDIR_JMP_TABLE blend_v,    avx512icl,    2, 4, 8, 16, 32
BIDIR_JMP_TABLE blend_h,    avx512icl,    2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE put,         avx512icl,    2, 4, 8, 16, 32, 64, 128
BASE_JMP_TABLE prep,        avx512icl,       4, 8, 16, 32, 64, 128
HV_JMP_TABLE   put,  bilin, avx512icl, 7, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE   prep, bilin, avx512icl, 7,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE   put,  6tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE   put,  8tap,  avx512icl, 2, 2, 4, 8, 16, 32, 64, 128
HV_JMP_TABLE   prep, 6tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128
HV_JMP_TABLE   prep, 8tap,  avx512icl, 2,    4, 8, 16, 32, 64, 128

; table_offset(put, _bilin_h) etc.: displacement of a jump table from the
; function's base label, usable as a constant in memory operands.
%define table_offset(type, fn) type %+ fn %+ SUFFIX %+ _table - type %+ SUFFIX
253
254cextern mc_subpel_filters
255%define subpel_filters (mangle(private_prefix %+ _mc_subpel_filters)-8)
256
257cextern mc_warp_filter
258cextern obmc_masks_avx2
259cextern resize_filter
260
SECTION .text

; Scratch register t0 per ABI: r4 on Win64, r8 on SysV (x86inc logical
; register numbering — chosen so t0 is volatile and doesn't alias the
; first function arguments on either ABI).
%if WIN64
DECLARE_REG_TMP 4
%else
DECLARE_REG_TMP 8
%endif
268
269INIT_ZMM avx512icl
270cglobal put_bilin_16bpc, 4, 8, 13, dst, ds, src, ss, w, h, mxy
271    mov                mxyd, r6m ; mx
272    lea                  r7, [put_avx512icl]
273    tzcnt               t0d, wm
274    movifnidn            hd, hm
275    test               mxyd, mxyd
276    jnz .h
277    mov                mxyd, r7m ; my
278    test               mxyd, mxyd
279    jnz .v
280.put:
281    movzx               t0d, word [r7+t0*2+table_offset(put,)]
282    add                  t0, r7
283    jmp                  t0
284.put_w2:
285    mov                 r6d, [srcq+ssq*0]
286    mov                 r7d, [srcq+ssq*1]
287    lea                srcq, [srcq+ssq*2]
288    mov        [dstq+dsq*0], r6d
289    mov        [dstq+dsq*1], r7d
290    lea                dstq, [dstq+dsq*2]
291    sub                  hd, 2
292    jg .put_w2
293    RET
294.put_w4:
295    mov                  r6, [srcq+ssq*0]
296    mov                  r7, [srcq+ssq*1]
297    lea                srcq, [srcq+ssq*2]
298    mov        [dstq+dsq*0], r6
299    mov        [dstq+dsq*1], r7
300    lea                dstq, [dstq+dsq*2]
301    sub                  hd, 2
302    jg .put_w4
303    RET
304.put_w8:
305    movu               xmm0, [srcq+ssq*0]
306    movu               xmm1, [srcq+ssq*1]
307    lea                srcq, [srcq+ssq*2]
308    mova       [dstq+dsq*0], xmm0
309    mova       [dstq+dsq*1], xmm1
310    lea                dstq, [dstq+dsq*2]
311    sub                  hd, 2
312    jg .put_w8
313    RET
314.put_w16:
315    movu                ym0, [srcq+ssq*0]
316    movu                ym1, [srcq+ssq*1]
317    lea                srcq, [srcq+ssq*2]
318    mova       [dstq+dsq*0], ym0
319    mova       [dstq+dsq*1], ym1
320    lea                dstq, [dstq+dsq*2]
321    sub                  hd, 2
322    jg .put_w16
323    RET
324.put_w32:
325    movu                 m0, [srcq+ssq*0]
326    movu                 m1, [srcq+ssq*1]
327    lea                srcq, [srcq+ssq*2]
328    mova       [dstq+dsq*0], m0
329    mova       [dstq+dsq*1], m1
330    lea                dstq, [dstq+dsq*2]
331    sub                  hd, 2
332    jg .put_w32
333    RET
334.put_w64:
335    movu                 m0, [srcq+ssq*0+64*0]
336    movu                 m1, [srcq+ssq*0+64*1]
337    movu                 m2, [srcq+ssq*1+64*0]
338    movu                 m3, [srcq+ssq*1+64*1]
339    lea                srcq, [srcq+ssq*2]
340    mova  [dstq+dsq*0+64*0], m0
341    mova  [dstq+dsq*0+64*1], m1
342    mova  [dstq+dsq*1+64*0], m2
343    mova  [dstq+dsq*1+64*1], m3
344    lea                dstq, [dstq+dsq*2]
345    sub                  hd, 2
346    jg .put_w64
347    RET
348.put_w128:
349    movu                 m0, [srcq+64*0]
350    movu                 m1, [srcq+64*1]
351    movu                 m2, [srcq+64*2]
352    movu                 m3, [srcq+64*3]
353    add                srcq, ssq
354    mova        [dstq+64*0], m0
355    mova        [dstq+64*1], m1
356    mova        [dstq+64*2], m2
357    mova        [dstq+64*3], m3
358    add                dstq, dsq
359    dec                  hd
360    jg .put_w128
361    RET
362.h:
363    vpbroadcastw         m5, mxyd
364    mov                mxyd, r7m ; my
365    vpbroadcastd         m4, [pw_16]
366    psubw                m4, m5
367    test               mxyd, mxyd
368    jnz .hv
369    ; 12-bit is rounded twice so we can't use the same pmulhrsw approach as .v
370    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_h)]
371    mov                 r6d, r8m ; bitdepth_max
372    add                  t0, r7
373    shr                 r6d, 11
374    vpbroadcastd         m6, [r7-put_avx512icl+put_bilin_h_rnd+r6*4]
375    jmp                  t0
376.h_w2:
377    movq               xmm1, [srcq+ssq*0]
378    movhps             xmm1, [srcq+ssq*1]
379    lea                srcq, [srcq+ssq*2]
380    pmullw             xmm0, xmm1, xm4
381    psrlq              xmm1, 16
382    pmullw             xmm1, xm5
383    paddw              xmm0, xm6
384    paddw              xmm0, xmm1
385    psrlw              xmm0, 4
386    movd       [dstq+dsq*0], xmm0
387    pextrd     [dstq+dsq*1], xmm0, 2
388    lea                dstq, [dstq+dsq*2]
389    sub                  hd, 2
390    jg .h_w2
391    RET
392.h_w4:
393    movq               xmm0, [srcq+ssq*0+0]
394    movhps             xmm0, [srcq+ssq*1+0]
395    movq               xmm1, [srcq+ssq*0+2]
396    movhps             xmm1, [srcq+ssq*1+2]
397    lea                srcq, [srcq+ssq*2]
398    pmullw             xmm0, xm4
399    pmullw             xmm1, xm5
400    paddw              xmm0, xm6
401    paddw              xmm0, xmm1
402    psrlw              xmm0, 4
403    movq       [dstq+dsq*0], xmm0
404    movhps     [dstq+dsq*1], xmm0
405    lea                dstq, [dstq+dsq*2]
406    sub                  hd, 2
407    jg .h_w4
408    RET
409.h_w8:
410    movu                xm0, [srcq+ssq*0+0]
411    vinserti32x4        ym0, [srcq+ssq*1+0], 1
412    movu                xm1, [srcq+ssq*0+2]
413    vinserti32x4        ym1, [srcq+ssq*1+2], 1
414    lea                srcq, [srcq+ssq*2]
415    pmullw              ym0, ym4
416    pmullw              ym1, ym5
417    paddw               ym0, ym6
418    paddw               ym0, ym1
419    psrlw               ym0, 4
420    mova          [dstq+dsq*0], xm0
421    vextracti32x4 [dstq+dsq*1], ym0, 1
422    lea                dstq, [dstq+dsq*2]
423    sub                  hd, 2
424    jg .h_w8
425    RET
426.h_w16:
427    movu                ym0, [srcq+ssq*0+0]
428    vinserti32x8         m0, [srcq+ssq*1+0], 1
429    movu                ym1, [srcq+ssq*0+2]
430    vinserti32x8         m1, [srcq+ssq*1+2], 1
431    lea                srcq, [srcq+ssq*2]
432    pmullw               m0, m4
433    pmullw               m1, m5
434    paddw                m0, m6
435    paddw                m0, m1
436    psrlw                m0, 4
437    mova          [dstq+dsq*0], ym0
438    vextracti32x8 [dstq+dsq*1], m0, 1
439    lea                dstq, [dstq+dsq*2]
440    sub                  hd, 2
441    jg .h_w16
442    RET
443.h_w32:
444    pmullw               m0, m4, [srcq+ssq*0+0]
445    pmullw               m2, m5, [srcq+ssq*0+2]
446    pmullw               m1, m4, [srcq+ssq*1+0]
447    pmullw               m3, m5, [srcq+ssq*1+2]
448    lea                srcq, [srcq+ssq*2]
449    paddw                m0, m6
450    paddw                m1, m6
451    paddw                m0, m2
452    paddw                m1, m3
453    psrlw                m0, 4
454    psrlw                m1, 4
455    mova       [dstq+dsq*0], m0
456    mova       [dstq+dsq*1], m1
457    lea                dstq, [dstq+dsq*2]
458    sub                  hd, 2
459    jg .h_w32
460    RET
461.h_w64:
462    pmullw               m0, m4, [srcq+64*0+0]
463    pmullw               m2, m5, [srcq+64*0+2]
464    pmullw               m1, m4, [srcq+64*1+0]
465    pmullw               m3, m5, [srcq+64*1+2]
466    add                srcq, ssq
467    paddw                m0, m6
468    paddw                m1, m6
469    paddw                m0, m2
470    paddw                m1, m3
471    psrlw                m0, 4
472    psrlw                m1, 4
473    mova        [dstq+64*0], m0
474    mova        [dstq+64*1], m1
475    add                dstq, dsq
476    dec                  hd
477    jg .h_w64
478    RET
479.h_w128:
480    pmullw               m0, m4, [srcq+64*0+0]
481    pmullw               m7, m5, [srcq+64*0+2]
482    pmullw               m1, m4, [srcq+64*1+0]
483    pmullw               m8, m5, [srcq+64*1+2]
484    pmullw               m2, m4, [srcq+64*2+0]
485    pmullw               m9, m5, [srcq+64*2+2]
486    pmullw               m3, m4, [srcq+64*3+0]
487    pmullw              m10, m5, [srcq+64*3+2]
488    add                srcq, ssq
489    REPX      {paddw x, m6}, m0, m1, m2, m3
490    paddw                m0, m7
491    paddw                m1, m8
492    paddw                m2, m9
493    paddw                m3, m10
494    REPX       {psrlw x, 4}, m0, m1, m2, m3
495    mova        [dstq+64*0], m0
496    mova        [dstq+64*1], m1
497    mova        [dstq+64*2], m2
498    mova        [dstq+64*3], m3
499    add                dstq, dsq
500    dec                  hd
501    jg .h_w128
502    RET
503.v:
504    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_v)]
505    shl                mxyd, 11
506    vpbroadcastw         m8, mxyd
507    add                  t0, r7
508    jmp                  t0
509.v_w2:
510    movd               xmm0, [srcq+ssq*0]
511.v_w2_loop:
512    movd               xmm1, [srcq+ssq*1]
513    lea                srcq, [srcq+ssq*2]
514    punpckldq          xmm2, xmm0, xmm1
515    movd               xmm0, [srcq+ssq*0]
516    punpckldq          xmm1, xmm0
517    psubw              xmm1, xmm2
518    pmulhrsw           xmm1, xm8
519    paddw              xmm1, xmm2
520    movd       [dstq+dsq*0], xmm1
521    pextrd     [dstq+dsq*1], xmm1, 1
522    lea                dstq, [dstq+dsq*2]
523    sub                  hd, 2
524    jg .v_w2_loop
525    RET
526.v_w4:
527    movq               xmm0, [srcq+ssq*0]
528.v_w4_loop:
529    movq               xmm1, [srcq+ssq*1]
530    lea                srcq, [srcq+ssq*2]
531    punpcklqdq         xmm2, xmm0, xmm1
532    movq               xmm0, [srcq+ssq*0]
533    punpcklqdq         xmm1, xmm0
534    psubw              xmm1, xmm2
535    pmulhrsw           xmm1, xm8
536    paddw              xmm1, xmm2
537    movq       [dstq+dsq*0], xmm1
538    movhps     [dstq+dsq*1], xmm1
539    lea                dstq, [dstq+dsq*2]
540    sub                  hd, 2
541    jg .v_w4_loop
542    RET
543.v_w8:
544    movu               xmm0, [srcq+ssq*0]
545.v_w8_loop:
546    vbroadcasti128     ymm1, [srcq+ssq*1]
547    lea                srcq, [srcq+ssq*2]
548    vpblendd           ymm2, ymm0, ymm1, 0xf0
549    vbroadcasti128     ymm0, [srcq+ssq*0]
550    vpblendd           ymm1, ymm0, 0xf0
551    psubw              ymm1, ymm2
552    pmulhrsw           ymm1, ym8
553    paddw              ymm1, ymm2
554    mova         [dstq+dsq*0], xmm1
555    vextracti128 [dstq+dsq*1], ymm1, 1
556    lea                dstq, [dstq+dsq*2]
557    sub                  hd, 2
558    jg .v_w8_loop
559    vzeroupper
560    RET
561.v_w16:
562    movu                ym0, [srcq+ssq*0]
563.v_w16_loop:
564    movu                ym3, [srcq+ssq*1]
565    lea                srcq, [srcq+ssq*2]
566    psubw               ym1, ym3, ym0
567    pmulhrsw            ym1, ym8
568    paddw               ym1, ym0
569    movu                ym0, [srcq+ssq*0]
570    psubw               ym2, ym0, ym3
571    pmulhrsw            ym2, ym8
572    paddw               ym2, ym3
573    mova       [dstq+dsq*0], ym1
574    mova       [dstq+dsq*1], ym2
575    lea                dstq, [dstq+dsq*2]
576    sub                  hd, 2
577    jg .v_w16_loop
578    RET
579.v_w32:
580    movu                 m0, [srcq+ssq*0]
581.v_w32_loop:
582    movu                 m3, [srcq+ssq*1]
583    lea                srcq, [srcq+ssq*2]
584    psubw                m1, m3, m0
585    pmulhrsw             m1, m8
586    paddw                m1, m0
587    movu                 m0, [srcq+ssq*0]
588    psubw                m2, m0, m3
589    pmulhrsw             m2, m8
590    paddw                m2, m3
591    mova       [dstq+dsq*0], m1
592    mova       [dstq+dsq*1], m2
593    lea                dstq, [dstq+dsq*2]
594    sub                  hd, 2
595    jg .v_w32_loop
596    RET
597.v_w64:
598    movu                 m0, [srcq+ssq*0+64*0]
599    movu                 m1, [srcq+ssq*0+64*1]
600.v_w64_loop:
601    movu                 m2, [srcq+ssq*1+64*0]
602    movu                 m3, [srcq+ssq*1+64*1]
603    lea                srcq, [srcq+ssq*2]
604    psubw                m4, m2, m0
605    pmulhrsw             m4, m8
606    paddw                m4, m0
607    movu                 m0, [srcq+ssq*0+64*0]
608    psubw                m5, m3, m1
609    pmulhrsw             m5, m8
610    paddw                m5, m1
611    movu                 m1, [srcq+ssq*0+64*1]
612    psubw                m6, m0, m2
613    pmulhrsw             m6, m8
614    psubw                m7, m1, m3
615    pmulhrsw             m7, m8
616    mova  [dstq+dsq*0+64*0], m4
617    mova  [dstq+dsq*0+64*1], m5
618    paddw                m6, m2
619    paddw                m7, m3
620    mova  [dstq+dsq*1+64*0], m6
621    mova  [dstq+dsq*1+64*1], m7
622    lea                dstq, [dstq+dsq*2]
623    sub                  hd, 2
624    jg .v_w64_loop
625    RET
626.v_w128:
627    movu                 m0, [srcq+ssq*0+64*0]
628    movu                 m1, [srcq+ssq*0+64*1]
629    movu                 m2, [srcq+ssq*0+64*2]
630    movu                 m3, [srcq+ssq*0+64*3]
631.v_w128_loop:
632    movu                 m4, [srcq+ssq*1+64*0]
633    movu                 m5, [srcq+ssq*1+64*1]
634    movu                 m6, [srcq+ssq*1+64*2]
635    movu                 m7, [srcq+ssq*1+64*3]
636    lea                srcq, [srcq+ssq*2]
637    psubw                m9, m4, m0
638    pmulhrsw             m9, m8
639    paddw                m9, m0
640    movu                 m0, [srcq+ssq*0+64*0]
641    psubw               m10, m5, m1
642    pmulhrsw            m10, m8
643    paddw               m10, m1
644    movu                 m1, [srcq+ssq*0+64*1]
645    psubw               m11, m6, m2
646    pmulhrsw            m11, m8
647    paddw               m11, m2
648    movu                 m2, [srcq+ssq*0+64*2]
649    psubw               m12, m7, m3
650    pmulhrsw            m12, m8
651    paddw               m12, m3
652    movu                 m3, [srcq+ssq*0+64*3]
653    mova  [dstq+dsq*0+64*0], m9
654    psubw                m9, m0, m4
655    pmulhrsw             m9, m8
656    mova  [dstq+dsq*0+64*1], m10
657    psubw               m10, m1, m5
658    pmulhrsw            m10, m8
659    mova  [dstq+dsq*0+64*2], m11
660    psubw               m11, m2, m6
661    pmulhrsw            m11, m8
662    mova  [dstq+dsq*0+64*3], m12
663    psubw               m12, m3, m7
664    pmulhrsw            m12, m8
665    paddw                m9, m4
666    paddw               m10, m5
667    mova  [dstq+dsq*1+64*0], m9
668    mova  [dstq+dsq*1+64*1], m10
669    paddw               m11, m6
670    paddw               m12, m7
671    mova  [dstq+dsq*1+64*2], m11
672    mova  [dstq+dsq*1+64*3], m12
673    lea                dstq, [dstq+dsq*2]
674    sub                  hd, 2
675    jg .v_w128_loop
676    RET
677.hv:
678    movzx               t0d, word [r7+t0*2+table_offset(put, _bilin_hv)]
679    shl                mxyd, 11
680    vpbroadcastd         m6, [pw_2]
681    vpbroadcastw         m7, mxyd
682    vpbroadcastd         m8, [pw_8192]
683    add                  t0, r7
684    test          dword r8m, 0x800
685    jnz .hv_12bpc
686    psllw                m4, 2
687    psllw                m5, 2
688    vpbroadcastd         m8, [pw_2048]
689.hv_12bpc:
690    jmp                  t0
691.hv_w2:
692    vpbroadcastq       xmm1, [srcq+ssq*0]
693    pmullw             xmm0, xmm1, xm4
694    psrlq              xmm1, 16
695    pmullw             xmm1, xm5
696    paddw              xmm0, xm6
697    paddw              xmm0, xmm1
698    psrlw              xmm0, 2
699.hv_w2_loop:
700    movq               xmm2, [srcq+ssq*1]
701    lea                srcq, [srcq+ssq*2]
702    movhps             xmm2, [srcq+ssq*0]
703    pmullw             xmm1, xmm2, xm4
704    psrlq              xmm2, 16
705    pmullw             xmm2, xm5
706    paddw              xmm1, xm6
707    paddw              xmm1, xmm2
708    psrlw              xmm1, 2                ; 1 _ 2 _
709    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 _ 1 _
710    mova               xmm0, xmm1
711    psubw              xmm1, xmm2
712    paddw              xmm1, xmm1
713    pmulhw             xmm1, xm7
714    paddw              xmm1, xmm2
715    pmulhrsw           xmm1, xm8
716    movd       [dstq+dsq*0], xmm1
717    pextrd     [dstq+dsq*1], xmm1, 2
718    lea                dstq, [dstq+dsq*2]
719    sub                  hd, 2
720    jg .hv_w2_loop
721    RET
722.hv_w4:
723    pmullw             xmm0, xm4, [srcq+ssq*0-8]
724    pmullw             xmm1, xm5, [srcq+ssq*0-6]
725    paddw              xmm0, xm6
726    paddw              xmm0, xmm1
727    psrlw              xmm0, 2
728.hv_w4_loop:
729    movq               xmm1, [srcq+ssq*1+0]
730    movq               xmm2, [srcq+ssq*1+2]
731    lea                srcq, [srcq+ssq*2]
732    movhps             xmm1, [srcq+ssq*0+0]
733    movhps             xmm2, [srcq+ssq*0+2]
734    pmullw             xmm1, xm4
735    pmullw             xmm2, xm5
736    paddw              xmm1, xm6
737    paddw              xmm1, xmm2
738    psrlw              xmm1, 2                ; 1 2
739    shufpd             xmm2, xmm0, xmm1, 0x01 ; 0 1
740    mova               xmm0, xmm1
741    psubw              xmm1, xmm2
742    paddw              xmm1, xmm1
743    pmulhw             xmm1, xm7
744    paddw              xmm1, xmm2
745    pmulhrsw           xmm1, xm8
746    movq       [dstq+dsq*0], xmm1
747    movhps     [dstq+dsq*1], xmm1
748    lea                dstq, [dstq+dsq*2]
749    sub                  hd, 2
750    jg .hv_w4_loop
751    RET
752.hv_w8:
753    pmullw             xmm0, xm4, [srcq+ssq*0+0]
754    pmullw             xmm1, xm5, [srcq+ssq*0+2]
755    paddw              xmm0, xm6
756    paddw              xmm0, xmm1
757    psrlw              xmm0, 2
758    vinserti32x4        ym0, xmm0, 1
759.hv_w8_loop:
760    movu                xm1, [srcq+ssq*1+0]
761    movu                xm2, [srcq+ssq*1+2]
762    lea                srcq, [srcq+ssq*2]
763    vinserti32x4        ym1, [srcq+ssq*0+0], 1
764    vinserti32x4        ym2, [srcq+ssq*0+2], 1
765    pmullw              ym1, ym4
766    pmullw              ym2, ym5
767    paddw               ym1, ym6
768    paddw               ym1, ym2
769    psrlw               ym1, 2              ; 1 2
770    vshufi32x4          ym2, ym0, ym1, 0x01 ; 0 1
771    mova                ym0, ym1
772    psubw               ym1, ym2
773    paddw               ym1, ym1
774    pmulhw              ym1, ym7
775    paddw               ym1, ym2
776    pmulhrsw            ym1, ym8
777    mova          [dstq+dsq*0], xm1
778    vextracti32x4 [dstq+dsq*1], ym1, 1
779    lea                dstq, [dstq+dsq*2]
780    sub                  hd, 2
781    jg .hv_w8_loop
782    RET
783.hv_w16:
784    pmullw              ym0, ym4, [srcq+ssq*0+0]
785    pmullw              ym1, ym5, [srcq+ssq*0+2]
786    paddw               ym0, ym6
787    paddw               ym0, ym1
788    psrlw               ym0, 2
789    vinserti32x8         m0, ym0, 1
790.hv_w16_loop:
791    movu                ym1, [srcq+ssq*1+0]
792    movu                ym2, [srcq+ssq*1+2]
793    lea                srcq, [srcq+ssq*2]
794    vinserti32x8         m1, [srcq+ssq*0+0], 1
795    vinserti32x8         m2, [srcq+ssq*0+2], 1
796    pmullw               m1, m4
797    pmullw               m2, m5
798    paddw                m1, m6
799    paddw                m1, m2
800    psrlw                m1, 2             ; 1 2
801    vshufi32x4           m2, m0, m1, q1032 ; 0 1
802    mova                 m0, m1
803    psubw                m1, m2
804    paddw                m1, m1
805    pmulhw               m1, m7
806    paddw                m1, m2
807    pmulhrsw             m1, m8
808    mova          [dstq+dsq*0], ym1
809    vextracti32x8 [dstq+dsq*1], m1, 1
810    lea                dstq, [dstq+dsq*2]
811    sub                  hd, 2
812    jg .hv_w16_loop
813    RET
814.hv_w32:
815.hv_w64:
816.hv_w128:
817    movifnidn            wd, wm
818    lea                 r6d, [hq+wq*8-256]
819    mov                  r4, srcq
820    mov                  r7, dstq
821.hv_w32_loop0:
822    pmullw               m0, m4, [srcq+ssq*0+0]
823    pmullw               m1, m5, [srcq+ssq*0+2]
824    paddw                m0, m6
825    paddw                m0, m1
826    psrlw                m0, 2
827.hv_w32_loop:
828    pmullw               m3, m4, [srcq+ssq*1+0]
829    pmullw               m1, m5, [srcq+ssq*1+2]
830    lea                srcq, [srcq+ssq*2]
831    paddw                m3, m6
832    paddw                m3, m1
833    psrlw                m3, 2
834    psubw                m1, m3, m0
835    paddw                m1, m1
836    pmulhw               m1, m7
837    paddw                m1, m0
838    pmullw               m0, m4, [srcq+ssq*0+0]
839    pmullw               m2, m5, [srcq+ssq*0+2]
840    paddw                m0, m6
841    paddw                m0, m2
842    psrlw                m0, 2
843    psubw                m2, m0, m3
844    paddw                m2, m2
845    pmulhw               m2, m7
846    paddw                m2, m3
847    pmulhrsw             m1, m8
848    pmulhrsw             m2, m8
849    mova       [dstq+dsq*0], m1
850    mova       [dstq+dsq*1], m2
851    lea                dstq, [dstq+dsq*2]
852    sub                  hd, 2
853    jg .hv_w32_loop
854    add                  r4, 64
855    add                  r7, 64
856    movzx                hd, r6b
857    mov                srcq, r4
858    mov                dstq, r7
859    sub                 r6d, 1<<8
860    jg .hv_w32_loop0
861    RET
862
;------------------------------------------------------------------------------
; prep_bilin_16bpc (AVX-512, Ice Lake)
; Bilinear "prep" (mct) path: writes intermediate 16-bit predictions to tmp.
; Register/stack args (per the cglobal line): tmp, src, stride; wm = width,
; hm = height, r5m = mx, r6m = my, r7m = bitdepth_max. Presumably matches
; dav1d's mct function signature -- confirm against src/mc.h.
; Dispatch: mx==0 && my==0 -> .prep (plain copy/scale); mx!=0 -> .h
; (then .hv if my!=0); my!=0 alone -> .v. Each case indirect-jumps through a
; per-width table looked up via tzcnt(w).
;------------------------------------------------------------------------------
cglobal prep_bilin_16bpc, 3, 7, 16, tmp, src, stride, w, h, mxy, stride3
    movifnidn          mxyd, r5m ; mx
    lea                  r6, [prep_avx512icl]
    tzcnt                wd, wm
    movifnidn            hd, hm
    test               mxyd, mxyd
    jnz .h
    mov                mxyd, r6m ; my
    test               mxyd, mxyd
    jnz .v
.prep:
    ; No subpel filtering: tmp = src * prep_mul[bitdepth_max >> 11] - 8192.
    ; r5d = bitdepth_max >> 11 selects the per-bitdepth scale (0 for 10bpc,
    ; 1 for 12bpc); table contents live in prep_mul elsewhere in the file.
    movzx                wd, word [r6+wq*2+table_offset(prep,)]
    mov                 r5d, r7m ; bitdepth_max
    vpbroadcastd         m5, [r6-prep_avx512icl+pw_8192]
    add                  wq, r6
    shr                 r5d, 11
    vpbroadcastd         m4, [r6-prep_avx512icl+prep_mul+r5*4]
    lea            stride3q, [strideq*3]
    jmp                  wq
.prep_w4:
    ; k1 = 0b1100: mask-merge row 3 (broadcast qword) into the top qword
    ; lanes of ym0 while the unpack supplies row 2 from ym1.
    mov                 r3d, 0x0c
    kmovb                k1, r3d
.prep_w4_loop:
    ; Pack 4 rows of 4 pixels into one ymm: rows 0/1 in the low xmm,
    ; rows 2/3 via masked vpunpcklqdq with an embedded qword broadcast.
    movq                xm0, [srcq+strideq*0]
    movhps              xm0, [srcq+strideq*1]
    vpbroadcastq        ym1, [srcq+strideq*2]
    vpunpcklqdq     ym0{k1}, ym1, [srcq+stride3q] {1to4}
    lea                srcq, [srcq+strideq*4]
    pmullw              ym0, ym4
    psubw               ym0, ym5
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .prep_w4_loop
    RET
.prep_w8:
    ; 4 rows of 8 pixels -> one zmm (one 128-bit lane per row).
    movu                xm0, [srcq+strideq*0]
    vinserti32x4        ym0, [srcq+strideq*1], 1
    vinserti32x4         m0, [srcq+strideq*2], 2
    vinserti32x4         m0, [srcq+stride3q ], 3
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    psubw                m0, m5
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .prep_w8
    RET
.prep_w16:
    ; 2 rows per zmm, 4 rows per iteration.
    movu                ym0, [srcq+strideq*0]
    vinserti32x8         m0, [srcq+strideq*1], 1
    movu                ym1, [srcq+strideq*2]
    vinserti32x8         m1, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m4
    psubw                m0, m5
    psubw                m1, m5
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    sub                  hd, 4
    jg .prep_w16
    RET
.prep_w32:
    ; 1 row per zmm, 4 rows per iteration.
    pmullw               m0, m4, [srcq+strideq*0]
    pmullw               m1, m4, [srcq+strideq*1]
    pmullw               m2, m4, [srcq+strideq*2]
    pmullw               m3, m4, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 4
    jg .prep_w32
    RET
.prep_w64:
    ; 2 zmm per row, 2 rows per iteration.
    pmullw               m0, m4, [srcq+strideq*0+64*0]
    pmullw               m1, m4, [srcq+strideq*0+64*1]
    pmullw               m2, m4, [srcq+strideq*1+64*0]
    pmullw               m3, m4, [srcq+strideq*1+64*1]
    lea                srcq, [srcq+strideq*2]
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    sub                  hd, 2
    jg .prep_w64
    RET
.prep_w128:
    ; 4 zmm per row, 1 row per iteration.
    pmullw               m0, m4, [srcq+64*0]
    pmullw               m1, m4, [srcq+64*1]
    pmullw               m2, m4, [srcq+64*2]
    pmullw               m3, m4, [srcq+64*3]
    add                srcq, strideq
    REPX      {psubw x, m5}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    dec                  hd
    jg .prep_w128
    RET
.h:
    ; Horizontal bilinear weights: m5 = mx, m4 = 16 - mx; bias m6 = 32766.
    ; For 10bpc (bit 11 of bitdepth_max clear) the weights are scaled by 4
    ; so the psraw-by-2 below yields the wider intermediate range.
    vpbroadcastw         m5, mxyd
    mov                mxyd, r6m ; my
    vpbroadcastd         m4, [pw_16]
    vpbroadcastd         m6, [pw_32766]
    psubw                m4, m5
    test          dword r7m, 0x800
    jnz .h_12bpc
    psllw                m4, 2
    psllw                m5, 2
.h_12bpc:
    test               mxyd, mxyd
    jnz .hv
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_h)]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.h_w4:
    ; Each row pair: out = (px*m4 + px_next*m5 - 32766) >> 2 (arithmetic),
    ; where px_next is the same row shifted one pixel (psrldq 2).
    movu                xm1, [srcq+strideq*0]
    vinserti32x4        ym1, [srcq+strideq*2], 1
    movu                xm2, [srcq+strideq*1]
    vinserti32x4        ym2, [srcq+stride3q ], 1
    lea                srcq, [srcq+strideq*4]
    punpcklqdq          ym0, ym1, ym2
    psrldq              ym1, 2
    psrldq              ym2, 2
    pmullw              ym0, ym4
    punpcklqdq          ym1, ym2
    pmullw              ym1, ym5
    psubw               ym0, ym6
    paddw               ym0, ym1
    psraw               ym0, 2
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4
    RET
.h_w8:
    ; The +2 loads provide the one-pixel-right neighbors directly.
    movu                xm0, [srcq+strideq*0+0]
    movu                xm1, [srcq+strideq*0+2]
    vinserti32x4        ym0, [srcq+strideq*1+0], 1
    vinserti32x4        ym1, [srcq+strideq*1+2], 1
    vinserti32x4         m0, [srcq+strideq*2+0], 2
    vinserti32x4         m1, [srcq+strideq*2+2], 2
    vinserti32x4         m0, [srcq+stride3q +0], 3
    vinserti32x4         m1, [srcq+stride3q +2], 3
    lea                srcq, [srcq+strideq*4]
    pmullw               m0, m4
    pmullw               m1, m5
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8
    RET
.h_w16:
    movu                ym0, [srcq+strideq*0+0]
    vinserti32x8         m0, [srcq+strideq*1+0], 1
    movu                ym1, [srcq+strideq*0+2]
    vinserti32x8         m1, [srcq+strideq*1+2], 1
    lea                srcq, [srcq+strideq*2]
    pmullw               m0, m4
    pmullw               m1, m5
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 2
    jg .h_w16
    RET
.h_w32:
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m2, m5, [srcq+strideq*0+2]
    pmullw               m1, m4, [srcq+strideq*1+0]
    pmullw               m3, m5, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    sub                  hd, 2
    jg .h_w32
    RET
.h_w64:
    pmullw               m0, m4, [srcq+ 0]
    pmullw               m2, m5, [srcq+ 2]
    pmullw               m1, m4, [srcq+64]
    pmullw               m3, m5, [srcq+66]
    add                srcq, strideq
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    add                tmpq, 64*2
    dec                  hd
    jg .h_w64
    RET
.h_w128:
    pmullw               m0, m4, [srcq+  0]
    pmullw               m7, m5, [srcq+  2]
    pmullw               m1, m4, [srcq+ 64]
    pmullw               m8, m5, [srcq+ 66]
    pmullw               m2, m4, [srcq+128]
    pmullw               m9, m5, [srcq+130]
    pmullw               m3, m4, [srcq+192]
    pmullw              m10, m5, [srcq+194]
    add                srcq, strideq
    REPX      {psubw x, m6}, m0, m1, m2, m3
    paddw                m0, m7
    paddw                m1, m8
    paddw                m2, m9
    paddw                m3, m10
    REPX       {psraw x, 2}, m0, m1, m2, m3
    mova        [tmpq+64*0], m0
    mova        [tmpq+64*1], m1
    mova        [tmpq+64*2], m2
    mova        [tmpq+64*3], m3
    add                tmpq, 64*4
    dec                  hd
    jg .h_w128
    RET
.v:
    ; Vertical-only: weights m9 = my, m8 = 16 - my, bias m10 = 32766;
    ; same 10bpc weight scaling as the .h path (skipped for 12bpc).
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_v)]
    vpbroadcastw         m9, mxyd
    vpbroadcastd         m8, [pw_16]
    vpbroadcastd        m10, [pw_32766]
    add                  wq, r6
    lea            stride3q, [strideq*3]
    psubw                m8, m9
    test          dword r7m, 0x800
    jnz .v_12bpc
    psllw                m8, 2
    psllw                m9, 2
.v_12bpc:
    jmp                  wq
.v_w4:
    ; rows 0..3 in ymm1, rows 1..4 in ymm2 (valignq shifts in the new row 4).
    movq               xmm0, [srcq+strideq*0]
.v_w4_loop:
    vpbroadcastq       xmm2, [srcq+strideq*1]
    vpbroadcastq       ymm1, [srcq+strideq*2]
    vpbroadcastq       ymm3, [srcq+stride3q ]
    lea                srcq, [srcq+strideq*4]
    vpblendd           ymm2, ymm1, 0x30
    vpblendd           ymm2, ymm3, 0xc0
    vpblendd           ymm1, ymm2, ymm0, 0x03 ; 0 1 2 3
    movq               xmm0, [srcq+strideq*0]
    valignq            ymm2, ymm0, ymm2, 1    ; 1 2 3 4
    pmullw             ymm1, ym8
    pmullw             ymm2, ym9
    psubw              ymm1, ym10
    paddw              ymm1, ymm2
    psraw              ymm1, 2
    mova             [tmpq], ymm1
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    ; this path used legacy-named ymm regs; clear upper state before RET
    vzeroupper
    RET
.v_w8:
    movu                xm0, [srcq+strideq*0]
.v_w8_loop:
    vinserti32x4        ym1, ym0, [srcq+strideq*1], 1
    vinserti32x4         m1, [srcq+strideq*2], 2
    vinserti32x4         m1, [srcq+stride3q ], 3 ; 0 1 2 3
    lea                srcq, [srcq+strideq*4]
    movu                xm0, [srcq+strideq*0]
    valignq              m2, m0, m1, 2           ; 1 2 3 4
    pmullw               m1, m8
    pmullw               m2, m9
    psubw                m1, m10
    paddw                m1, m2
    psraw                m1, 2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
.v_w16:
    movu                ym0, [srcq+strideq*0]
.v_w16_loop:
    vinserti32x8         m1, m0, [srcq+strideq*1], 1 ; 0 1
    movu                ym3, [srcq+strideq*2]
    vinserti32x8         m2, m3, [srcq+stride3q ], 1 ; 2 3
    lea                srcq, [srcq+strideq*4]
    movu                ym0, [srcq+strideq*0]
    vshufi32x4           m3, m1, m3, q1032           ; 1 2
    vshufi32x4           m4, m2, m0, q1032           ; 3 4
    pmullw               m1, m8
    pmullw               m2, m8
    pmullw               m3, m9
    pmullw               m4, m9
    psubw                m1, m10
    psubw                m2, m10
    paddw                m1, m3
    paddw                m2, m4
    psraw                m1, 2
    psraw                m2, 2
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 4
    jg .v_w16_loop
    RET
.v_w32:
    ; m0 always holds the last-loaded row so each row is read only once.
    movu                 m0, [srcq+strideq*0]
.v_w32_loop:
    movu                 m3, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    pmullw               m1, m8, m0
    movu                 m0, [srcq+strideq*0]
    pmullw               m2, m8, m3
    pmullw               m3, m9
    pmullw               m4, m9, m0
    psubw                m1, m10
    psubw                m2, m10
    paddw                m1, m3
    paddw                m2, m4
    psraw                m1, 2
    psraw                m2, 2
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 2
    jg .v_w32_loop
    RET
.v_w64:
    movu                 m0, [srcq+64*0]
    movu                 m1, [srcq+64*1]
.v_w64_loop:
    add                srcq, strideq
    pmullw               m2, m8, m0
    movu                 m0, [srcq+64*0]
    pmullw               m3, m8, m1
    movu                 m1, [srcq+64*1]
    pmullw               m4, m9, m0
    pmullw               m5, m9, m1
    psubw                m2, m10
    psubw                m3, m10
    paddw                m2, m4
    paddw                m3, m5
    psraw                m2, 2
    psraw                m3, 2
    mova        [tmpq+64*0], m2
    mova        [tmpq+64*1], m3
    add                tmpq, 64*2
    dec                  hd
    jg .v_w64_loop
    RET
.v_w128:
    movu                 m0, [srcq+64*0]
    movu                 m1, [srcq+64*1]
    movu                 m2, [srcq+64*2]
    movu                 m3, [srcq+64*3]
.v_w128_loop:
    add                srcq, strideq
    pmullw               m4, m8, m0
    movu                 m0, [srcq+64*0]
    pmullw               m5, m8, m1
    movu                 m1, [srcq+64*1]
    pmullw               m6, m8, m2
    movu                 m2, [srcq+64*2]
    pmullw               m7, m8, m3
    movu                 m3, [srcq+64*3]
    pmullw              m11, m9, m0
    pmullw              m12, m9, m1
    pmullw              m13, m9, m2
    pmullw              m14, m9, m3
    REPX     {psubw x, m10}, m4, m5, m6, m7
    paddw                m4, m11
    paddw                m5, m12
    paddw                m6, m13
    paddw                m7, m14
    REPX       {psraw x, 2}, m4, m5, m6, m7
    mova        [tmpq+64*0], m4
    mova        [tmpq+64*1], m5
    mova        [tmpq+64*2], m6
    mova        [tmpq+64*3], m7
    add                tmpq, 64*4
    dec                  hd
    jg .v_w128_loop
    RET
.hv:
    ; Two-pass: the H weights (m4/m5/m6) were already set up in .h.
    ; m7 = my << 11, so pmulhrsw(d, m7) computes round(d * my / 16),
    ; i.e. out = prev + ((cur - prev) * my >> 4) with rounding.
    movzx                wd, word [r6+wq*2+table_offset(prep, _bilin_hv)]
    shl                mxyd, 11
    vpbroadcastw         m7, mxyd
    add                  wq, r6
    lea            stride3q, [strideq*3]
    jmp                  wq
.hv_w4:
    ; Prime m0 with the H-filtered row 0, replicated so valignq below can
    ; rotate it in as the "previous row" lane.
    movq               xmm0, [srcq+strideq*0+0]
    movq               xmm1, [srcq+strideq*0+2]
    pmullw             xmm0, xm4
    pmullw             xmm1, xm5
    psubw              xmm0, xm6
    paddw              xmm0, xmm1
    psraw              xmm0, 2
    vpbroadcastq        ym0, xmm0
.hv_w4_loop:
    movu                xm1, [srcq+strideq*1]
    vinserti128         ym1, [srcq+stride3q ], 1
    movu                xm2, [srcq+strideq*2]
    lea                srcq, [srcq+strideq*4]
    vinserti128         ym2, [srcq+strideq*0], 1
    punpcklqdq          ym3, ym1, ym2
    psrldq              ym1, 2
    psrldq              ym2, 2
    pmullw              ym3, ym4
    punpcklqdq          ym1, ym2
    pmullw              ym1, ym5
    psubw               ym3, ym6
    paddw               ym1, ym3
    psraw               ym1, 2           ; 1 2 3 4
    valignq             ym2, ym1, ym0, 3 ; 0 1 2 3
    mova                ym0, ym1
    psubw               ym1, ym2
    pmulhrsw            ym1, ym7
    paddw               ym1, ym2
    mova             [tmpq], ym1
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
.hv_w8:
    ; Row 0 parked in the top 128-bit lane of m0 for the valignq rotate.
    pmullw              xm0, xm4, [srcq+strideq*0+0]
    pmullw              xm1, xm5, [srcq+strideq*0+2]
    psubw               xm0, xm6
    paddw               xm0, xm1
    psraw               xm0, 2
    vinserti32x4         m0, xm0, 3
.hv_w8_loop:
    movu                xm1, [srcq+strideq*1+0]
    movu                xm2, [srcq+strideq*1+2]
    vinserti32x4        ym1, [srcq+strideq*2+0], 1
    vinserti32x4        ym2, [srcq+strideq*2+2], 1
    vinserti32x4         m1, [srcq+stride3q +0], 2
    vinserti32x4         m2, [srcq+stride3q +2], 2
    lea                srcq, [srcq+strideq*4]
    vinserti32x4         m1, [srcq+strideq*0+0], 3
    vinserti32x4         m2, [srcq+strideq*0+2], 3
    pmullw               m1, m4
    pmullw               m2, m5
    psubw                m1, m6
    paddw                m1, m2
    psraw                m1, 2         ; 1 2 3 4
    valignq              m2, m1, m0, 6 ; 0 1 2 3
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m7
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 4
    jg .hv_w8_loop
    RET
.hv_w16:
    pmullw              ym0, ym4, [srcq+strideq*0+0]
    pmullw              ym1, ym5, [srcq+strideq*0+2]
    psubw               ym0, ym6
    paddw               ym0, ym1
    psraw               ym0, 2
    vinserti32x8         m0, ym0, 1
.hv_w16_loop:
    movu                ym1, [srcq+strideq*1+0]
    movu                ym2, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m1, [srcq+strideq*0+0], 1
    vinserti32x8         m2, [srcq+strideq*0+2], 1
    pmullw               m1, m4
    pmullw               m2, m5
    psubw                m1, m6
    paddw                m1, m2
    psraw                m1, 2             ; 1 2
    vshufi32x4           m2, m0, m1, q1032 ; 0 1
    mova                 m0, m1
    psubw                m1, m2
    pmulhrsw             m1, m7
    paddw                m1, m2
    mova             [tmpq], m1
    add                tmpq, 64
    sub                  hd, 2
    jg .hv_w16_loop
    RET
.hv_w32:
    ; Full-zmm rows: m0 carries the previous H-filtered row across
    ; iterations; each iteration produces two output rows.
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m1, m5, [srcq+strideq*0+2]
    psubw                m0, m6
    paddw                m0, m1
    psraw                m0, 2
.hv_w32_loop:
    pmullw               m3, m4, [srcq+strideq*1+0]
    pmullw               m1, m5, [srcq+strideq*1+2]
    lea                srcq, [srcq+strideq*2]
    psubw                m3, m6
    paddw                m3, m1
    psraw                m3, 2
    psubw                m1, m3, m0
    pmulhrsw             m1, m7
    paddw                m1, m0
    pmullw               m0, m4, [srcq+strideq*0+0]
    pmullw               m2, m5, [srcq+strideq*0+2]
    psubw                m0, m6
    paddw                m0, m2
    psraw                m0, 2
    psubw                m2, m0, m3
    pmulhrsw             m2, m7
    paddw                m2, m3
    mova        [tmpq+64*0], m1
    mova        [tmpq+64*1], m2
    add                tmpq, 64*2
    sub                  hd, 2
    jg .hv_w32_loop
    RET
.hv_w64:
    ; Two zmm per row; m0/m1 hold the previous row's H-filtered halves.
    pmullw               m0, m4, [srcq+ 0]
    pmullw               m2, m5, [srcq+ 2]
    pmullw               m1, m4, [srcq+64]
    pmullw               m3, m5, [srcq+66]
    psubw                m0, m6
    psubw                m1, m6
    paddw                m0, m2
    paddw                m1, m3
    psraw                m0, 2
    psraw                m1, 2
.hv_w64_loop:
    add                srcq, strideq
    pmullw               m2, m4, [srcq+ 0]
    pmullw               m8, m5, [srcq+ 2]
    pmullw               m3, m4, [srcq+64]
    pmullw               m9, m5, [srcq+66]
    psubw                m2, m6
    psubw                m3, m6
    paddw                m2, m8
    paddw                m3, m9
    psraw                m2, 2
    psraw                m3, 2
    psubw                m8, m2, m0
    psubw                m9, m3, m1
    pmulhrsw             m8, m7
    pmulhrsw             m9, m7
    paddw                m8, m0
    mova                 m0, m2
    paddw                m9, m1
    mova                 m1, m3
    mova        [tmpq+64*0], m8
    mova        [tmpq+64*1], m9
    add                tmpq, 64*2
    dec                  hd
    jg .hv_w64_loop
    RET
.hv_w128:
    ; Four zmm per row; m0-m3 hold the previous row's H-filtered quarters.
    pmullw               m0, m4, [srcq+  0]
    pmullw               m8, m5, [srcq+  2]
    pmullw               m1, m4, [srcq+ 64]
    pmullw               m9, m5, [srcq+ 66]
    pmullw               m2, m4, [srcq+128]
    pmullw              m10, m5, [srcq+130]
    pmullw               m3, m4, [srcq+192]
    pmullw              m11, m5, [srcq+194]
    REPX      {psubw x, m6}, m0, m1, m2, m3
    paddw                m0, m8
    paddw                m1, m9
    paddw                m2, m10
    paddw                m3, m11
    REPX       {psraw x, 2}, m0, m1, m2, m3
.hv_w128_loop:
    add                srcq, strideq
    pmullw               m8, m4, [srcq+  0]
    pmullw              m12, m5, [srcq+  2]
    pmullw               m9, m4, [srcq+ 64]
    pmullw              m13, m5, [srcq+ 66]
    pmullw              m10, m4, [srcq+128]
    pmullw              m14, m5, [srcq+130]
    pmullw              m11, m4, [srcq+192]
    pmullw              m15, m5, [srcq+194]
    REPX      {psubw x, m6}, m8, m9, m10, m11
    paddw                m8, m12
    paddw                m9, m13
    paddw               m10, m14
    paddw               m11, m15
    REPX       {psraw x, 2}, m8, m9, m10, m11
    psubw               m12, m8, m0
    psubw               m13, m9, m1
    psubw               m14, m10, m2
    psubw               m15, m11, m3
    REPX   {pmulhrsw x, m7}, m12, m13, m14, m15
    paddw               m12, m0
    mova                 m0, m8
    paddw               m13, m1
    mova                 m1, m9
    mova        [tmpq+64*0], m12
    mova        [tmpq+64*1], m13
    paddw               m14, m2
    mova                 m2, m10
    paddw               m15, m3
    mova                 m3, m11
    mova        [tmpq+64*2], m14
    mova        [tmpq+64*3], m15
    add                tmpq, 64*4
    dec                  hd
    jg .hv_w128_loop
    RET
1485
; int8_t subpel_filters[5][15][8]
; Each FILTER_* constant packs two byte offsets into subpel_filters[][]:
; the high 16 bits select the filter set used for the wide (8/6-tap) paths
; and the low 16 bits the 4-tap set for small widths. They are consumed as
; t0d/t1d by the `add mxd, t0d` / `add myd, t1d` sequences in the filter
; entry points (see the "; 6tap_h, mx, 4tap_h" comments there).
%assign FILTER_REGULAR (0*15 << 16) | 3*15
%assign FILTER_SMOOTH  (1*15 << 16) | 4*15
%assign FILTER_SHARP   (2*15 << 16) | 3*15
1490
%macro FN 4-5 ; prefix, type, type_h, type_v, jmp_to
; Emits the %1_%2_16bpc entry point: loads the packed horizontal/vertical
; filter-type descriptors (FILTER_*) into t0d/t1d, then tail-jumps to the
; shared implementation named by the optional 5th argument. The variant
; instantiated without a 5th argument must be emitted last so that it
; falls through directly into the shared implementation that follows it.
cglobal %1_%2_16bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d ; h and v filter types identical
%else
    mov                 t1d, FILTER_%4
%endif
%if %0 == 5 ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%5 %+ SUFFIX)
%endif
%endmacro
1503
%macro MC_8TAP_FN 4 ; prefix, type, type_h, type_v
; Same pattern as FN, specialized for the 8-tap entry points: every type
; except `regular` tail-jumps to the shared %1_8tap_16bpc implementation;
; the `regular` variant is emitted last and falls through into it.
cglobal %1_8tap_%2_16bpc
    mov                 t0d, FILTER_%3
%ifidn %3, %4
    mov                 t1d, t0d ; h and v filter types identical
%else
    mov                 t1d, FILTER_%4
%endif
%ifnidn %2, regular ; skip the jump in the last filter
    jmp mangle(private_prefix %+ _%1_8tap_16bpc %+ SUFFIX)
%endif
%endmacro
1516
; ABI-dependent choices: t0/t1 scratch registers for the FN/MC_8TAP_FN
; prologues, and `buf`, a small stack scratch area used to stage subpel
; filter coefficients (stored/reloaded as [buf], [buf+4], [buf+8] below).
; Win64 reuses the caller-provided 32-byte shadow space; SysV uses the
; red zone below rsp (safe here: no calls are made while buf is live).
%if WIN64
DECLARE_REG_TMP 4, 5
%define buf rsp+stack_offset+8 ; shadow space
%else
DECLARE_REG_TMP 7, 8
%define buf rsp-40 ; red zone
%endif
1524
; put_8tap entry points whose filter combinations (smooth/regular only,
; no sharp in either direction) can be serviced by the 6-tap kernel:
; all but the last jump to put_6tap_16bpc; the `regular` variant has no
; jump target and falls through into the put_6tap_16bpc cglobal below.
%define PUT_8TAP_FN FN put_8tap,
PUT_8TAP_FN smooth,         SMOOTH,  SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN smooth_regular, SMOOTH,  REGULAR, put_6tap_16bpc
PUT_8TAP_FN regular_smooth, REGULAR, SMOOTH,  put_6tap_16bpc
PUT_8TAP_FN regular,        REGULAR, REGULAR
1530
1531cglobal put_6tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
1532%define base r8-put_avx512icl
1533    imul                mxd, mxm, 0x010101
1534    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
1535    imul                myd, mym, 0x010101
1536    add                 myd, t1d ; 6tap_v, my, 4tap_v
1537    lea                  r8, [put_avx512icl]
1538    movifnidn            wd, wm
1539    movifnidn            hd, hm
1540    test                mxd, 0xf00
1541    jnz .h
1542    test                myd, 0xf00
1543    jnz .v
1544.put:
1545    tzcnt                wd, wd
1546    movzx                wd, word [r8+wq*2+table_offset(put,)]
1547    add                  wq, r8
1548%if WIN64
1549    pop                  r8
1550%endif
1551    jmp                  wq
1552.h_w8:
1553    mova                 m4, [spel_h_shufA]
1554    movu                 m5, [spel_h_shufB]
1555    movu                 m6, [spel_h_shufC]
1556.h_w8_loop:
1557    movu                ym2, [srcq+ssq*0]
1558    vinserti32x8         m2, [srcq+ssq*1], 1
1559    lea                srcq, [srcq+ssq*2]
1560    mova                 m0, m8
1561    vpermb               m1, m4, m2
1562    vpdpwssd             m0, m10, m1
1563    vpermb               m1, m5, m2
1564    vpdpwssd             m0, m11, m1
1565    vpermb               m1, m6, m2
1566    vpdpwssd             m0, m12, m1
1567    psrad                m0, 6
1568    vextracti32x8       ym1, m0, 1
1569    packusdw            ym0, ym1
1570    pminsw              ym0, ym15
1571    mova          [dstq+dsq*0], xm0
1572    vextracti32x4 [dstq+dsq*1], ym0, 1
1573    lea                dstq, [dstq+dsq*2]
1574    sub                  hd, 2
1575    jg .h_w8_loop
1576    RET
; Horizontal-only 6-tap filter entry, 16 bpc.
; m15 = clamp ceiling loaded from r8m (presumably bitdepth_max — used below by
; pminsw). If the vertical filter id is also set (myd & 0xf00), fall through
; to the combined .hv path instead.
.h:
    vpbroadcastw        m15, r8m
    test                myd, 0xf00
    jnz .hv
    ; Select the horizontal rounding constant by r8m>>11 (bit 11 of the pixel
    ; maximum distinguishes the two bitdepth variants of put_8tap_h_rnd).
    mov                 r7d, r8m
    shr                 r7d, 11
    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
    cmp                  wd, 4
    ; w <= 4 shares the narrow horizontal path of the 8-tap function.
    jle mangle(private_prefix %+ _put_8tap_16bpc_avx512icl).h_w4
    shr                 mxd, 16
    sub                srcq, 4
    ; Load 8 signed filter bytes starting at offset 1 of the subpel entry and
    ; widen to words; only the first three dword pairs (6 taps) are broadcast:
    ; m10 = taps 0-1, m11 = taps 2-3, m12 = taps 4-5.
    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
    mova              [buf], xmm0
    vpbroadcastd        m10, xmm0
    vpbroadcastd        m12, [buf+8]
    vpbroadcastd        m11, [buf+4]
    sub                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    jg .h_w32
; w == 16: two rows per iteration; each zmm holds one 16-pixel row per 256-bit
; half (row 0 in the low ymm, row 1 inserted in the high ymm).
.h_w16_loop:
    movu                ym2, [srcq+ssq*0+ 0]
    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
    movu                ym3, [srcq+ssq*0+12]
    vinserti32x8         m3, [srcq+ssq*1+12], 1
    lea                srcq, [srcq+ssq*2]
    ; m0/m1 start as the rounding constant; vpdpwssd accumulates the three
    ; tap-pair dot products on top of it.
    mova                 m0, m8
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0  b0
    pshufb               m4, m3, m7
    vpdpwssd             m1, m12, m4 ; a2' b2'
    pshufb               m2, m7
    pshufb               m3, m6
    vpdpwssd             m0, m11, m2 ; a1  b1
    vpdpwssd             m1, m11, m3 ; a1' b1'
    ; Merge the middle source windows so the remaining tap pair of each half
    ; can use a single shared register.
    shufpd               m2, m3, 0x55
    vpdpwssd             m0, m12, m2 ; a2  b2
    vpdpwssd             m1, m10, m2 ; a0' b0'
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16_loop
    RET
; w >= 32 horizontal path: one row per outer iteration. srcq/dstq are advanced
; to the row end and r6 counts from -w up to 0 in steps of 32 pixels, so the
; inner loop needs only one moving index.
.h_w32:
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    ; Two overlapping 64-byte loads cover the 6-tap footprint of 32 pixels.
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+12]
    mova                 m0, m8
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m7
    vpdpwssd             m1, m12, m4 ; b2
    pshufb               m2, m7
    pshufb               m3, m6
    vpdpwssd             m0, m11, m2 ; a1
    vpdpwssd             m1, m11, m3 ; b1
    shufpd               m2, m3, 0x55
    vpdpwssd             m0, m12, m2 ; a2
    vpdpwssd             m1, m10, m2 ; b0
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova        [dstq+r6*2], m0
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w32_loop0
    RET
; Vertical-only 6-tap filter entry. For h < 6 the short (low-byte) filter id
; is selected via cmovs. Coefficients are spilled to the stack and broadcast
; as three dword tap pairs: m12 = taps 0-1, m13 = taps 2-3, m14 = taps 4-5.
; r6 = -ssq so [srcq+r6*2]/[srcq+r6*1] address the two rows above srcq.
; Dispatch by log2(w) through the _6tap_v jump table.
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd
    vpbroadcastd        m11, [pd_32]      ; rounding for the final >> 6
    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8]
    tzcnt               r7d, wd
    vpbroadcastw        m15, r8m          ; clamp ceiling (pixel max)
    mov                  r6, ssq
    movzx               r7d, word [r8+r7*2+table_offset(put, _6tap_v)]
    neg                  r6
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd        m12, xmm0
    add                  r7, r8
    vpbroadcastd        m13, [rsp+stack_offset+12]
    vpbroadcastd        m14, [rsp+stack_offset+16]
    jmp                  r7
; w == 2 vertical: keep a sliding window of row pairs (01 12 / 23 34 / 45 56)
; in xmm registers, two output rows per iteration.
.v_w2:
    movd               xmm2, [srcq+r6 *2]
    pinsrd             xmm2, [srcq+r6 *1], 1
    pinsrd             xmm2, [srcq+ssq*0], 2
    pinsrd             xmm2, [srcq+ssq*1], 3 ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    movd               xmm0, [srcq+ssq*0]
    palignr            xmm3, xmm0, xmm2, 4   ; 1 2 3 4
    punpcklwd          xmm1, xmm2, xmm3      ; 01 12
    punpckhwd          xmm2, xmm3            ; 23 34
.v_w2_loop:
    movd               xmm3, [srcq+ssq*1]
    mova               xmm4, xm11            ; start from rounding constant
    vpdpwssd           xmm4, xmm1, xm12      ; a0 b0
    lea                srcq, [srcq+ssq*2]
    mova               xmm1, xmm2            ; slide the window down two rows
    vpdpwssd           xmm4, xmm2, xm13      ; a1 b1
    punpckldq          xmm2, xmm0, xmm3      ; 4 5
    movd               xmm0, [srcq+ssq*0]
    punpckldq          xmm3, xmm0            ; 5 6
    punpcklwd          xmm2, xmm3            ; 45 56
    vpdpwssd           xmm4, xmm2, xm14      ; a2 b2
    psrad              xmm4, 6
    packusdw           xmm4, xmm4
    pminsw             xmm4, xm15            ; clamp to pixel max
    movd       [dstq+dsq*0], xmm4
    pextrd     [dstq+dsq*1], xmm4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
; w == 4 vertical: same sliding-window scheme as .v_w2 but with 256-bit
; registers, interleaving row pairs with vpblendd/punpcklwd. Uses legacy ymm
; names (no EVEX zmm), hence the vzeroupper before returning.
.v_w4:
    movq               xmm1, [srcq+r6 *2]
    vpbroadcastq       ymm3, [srcq+r6 *1]
    vpbroadcastq       ymm2, [srcq+ssq*0]
    vpbroadcastq       ymm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm1, ymm3, 0x30
    vpblendd           ymm3, ymm2, 0x30
    punpcklwd          ymm1, ymm3       ; 01 12
    vpblendd           ymm2, ymm4, 0x30
    vpblendd           ymm4, ymm0, 0x30
    punpcklwd          ymm2, ymm4       ; 23 34
.v_w4_loop:
    vpbroadcastq       ymm3, [srcq+ssq*1]
    mova               ymm4, ym11       ; start from rounding constant
    vpdpwssd           ymm4, ymm1, ym12 ; a0 b0
    lea                srcq, [srcq+ssq*2]
    mova               ymm1, ymm2       ; slide the window down two rows
    vpdpwssd           ymm4, ymm2, ym13 ; a1 b1
    vpblendd           ymm2, ymm0, ymm3, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm3, ymm0, 0x30
    punpcklwd          ymm2, ymm3       ; 45 56
    vpdpwssd           ymm4, ymm2, ym14 ; a2 b2
    psrad              ymm4, 6
    vextracti128       xmm3, ymm4, 1
    packusdw           xmm4, xmm3
    pminsw             xmm4, xm15       ; clamp to pixel max
    movq       [dstq+dsq*0], xmm4
    movhps     [dstq+dsq*1], xmm4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    vzeroupper
    RET
; w == 8 vertical: rows are gathered three per zmm (one per 128-bit lane) and
; vpermb with spel_v_shuf8 interleaves adjacent rows into word pairs.
.v_w8:
    vbroadcasti32x4      m0, [srcq+ssq*0]
    vinserti32x4         m1, m0, [srcq+r6 *2], 0
    vinserti32x4         m1, [srcq+r6 *1], 1 ; 0 1 2
    vinserti32x4        ym0, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m5, [spel_v_shuf8]
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 2 3 4
    vpermb               m1, m5, m1          ; 01 12
    vpermb               m2, m5, m0          ; 23 34
.v_w8_loop:
    vinserti32x4         m0, [srcq+ssq*1], 3
    lea                srcq, [srcq+ssq*2]
    movu                xm3, [srcq+ssq*0]
    mova                 m4, m11             ; start from rounding constant
    vpdpwssd             m4, m12, m1         ; a0 b0
    vshufi32x4           m0, m3, q1032       ; 4 5 6
    mova                 m1, m2              ; slide the window down two rows
    vpdpwssd             m4, m13, m2         ; a1 b1
    vpermb               m2, m5, m0          ; 45 56
    vpdpwssd             m4, m14, m2         ; a2 b2
    psrad                m4, 6
    vextracti32x8       ym3, m4, 1
    packusdw            ym4, ym3
    pminsw              ym4, ym15            ; clamp to pixel max
    mova          [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
; w == 16 vertical: each zmm holds two interleaved row pairs built with
; spel_v_shuf16; vpshrdd reconstructs the odd pairs (23, 45) from neighboring
; even ones by a 16-bit funnel shift, saving loads.
.v_w16:
    vbroadcasti32x8      m0, [srcq+r6 *1]
    vinserti32x8         m1, m0, [srcq+ssq*0], 1
    vinserti32x8         m0, [srcq+r6*2], 0
    mova                 m6, [spel_v_shuf16]
    movu                ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m3, [srcq+ssq*0], 1
    vpermb               m1, m6, m1     ; 12
    vpermb               m0, m6, m0     ; 01
    vpermb               m3, m6, m3     ; 34
    mova                 m7, [deint_q_shuf]
    vpshrdd              m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova                 m5, m11        ; rounding constant accumulators
    vpdpwssd             m5, m12, m1    ; b0
    mova                 m4, m11
    vpdpwssd             m4, m12, m0    ; a0
    mova                 m1, m3         ; slide window
    vpdpwssd             m5, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m4, m13, m2    ; a1
    movu                ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m3, [srcq+ssq*0], 1
    vpermb               m3, m6, m3     ; 56
    vpshrdd              m2, m1, m3, 16 ; 45
    vpdpwssd             m5, m14, m3    ; b2
    vpdpwssd             m4, m14, m2    ; a2
    psrad                m5, 6
    psrad                m4, 6
    packusdw             m4, m5
    pminsw               m4, m15        ; clamp to pixel max
    vpermq               m4, m7, m4     ; deinterleave qwords into row order
    mova          [dstq+dsq*0], ym4
    vextracti32x8 [dstq+dsq*1], m4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
; w >= 32 vertical: processed in 32-pixel (64-byte) column strips. wd packs
; the strip count in its high bits and the row count h in its low byte
; (wd = h + w*8 - 256); movzx hd, wb restores h for each strip. Uses
; m16-m20 (AVX-512 extended registers), hence the trailing vzeroupper.
.v_w32:
.v_w64:
.v_w128:
    lea                  wd, [hq+wq*8-256]
.v_w32_loop0:
    ; Prime the 5-row window for this column strip.
    movu                m16, [srcq+r6 *2]
    movu                m17, [srcq+r6 *1]
    lea                  r7, [srcq+ssq*2]
    movu                m18, [srcq+ssq*0]
    movu                m19, [srcq+ssq*1]
    mov                  r8, dstq
    movu                m20, [r7  +ssq*0]
    punpcklwd            m0, m16, m17 ; 01
    punpckhwd           m16, m17
    punpcklwd            m1, m17, m18 ; 12
    punpckhwd           m17, m18
    punpcklwd            m2, m18, m19 ; 23
    punpckhwd           m18, m19
    punpcklwd            m3, m19, m20 ; 34
    punpckhwd           m19, m20
.v_w32_loop:
    ; Low halves accumulate in m4/m5, high halves in m6/m7; a = even row,
    ; b = odd row of the output pair.
    mova                 m4, m11
    vpdpwssd             m4, m12, m0  ; a0
    mova                 m6, m11
    vpdpwssd             m6, m12, m16
    mova                 m5, m11
    vpdpwssd             m5, m12, m1  ; b0
    mova                 m7, m11
    vpdpwssd             m7, m12, m17
    mova                 m0, m2
    vpdpwssd             m4, m13, m2  ; a1
    mova                m16, m18
    vpdpwssd             m6, m13, m18
    mova                 m1, m3
    vpdpwssd             m5, m13, m3  ; b1
    mova                m17, m19
    vpdpwssd             m7, m13, m19
    movu                m19, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    punpcklwd            m2, m20, m19 ; 45
    punpckhwd           m18, m20, m19
    movu                m20, [r7+ssq*0]
    vpdpwssd             m4, m14, m2  ; a2
    vpdpwssd             m6, m14, m18
    punpcklwd            m3, m19, m20 ; 56
    punpckhwd           m19, m20
    vpdpwssd             m5, m14, m3  ; b2
    vpdpwssd             m7, m14, m19
    REPX       {psrad x, 6}, m4, m6, m5, m7
    packusdw             m4, m6
    packusdw             m5, m7
    pminsw               m4, m15      ; clamp to pixel max
    pminsw               m5, m15
    mova         [r8+dsq*0], m4
    mova         [r8+dsq*1], m5
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    ; Advance to the next 32-pixel column strip.
    add                srcq, 64
    add                dstq, 64
    movzx                hd, wb
    sub                  wd, 1<<8
    jg .v_w32_loop0
    vzeroupper
    RET
; Combined horizontal+vertical (2D) filter, w <= 4. The horizontal pass keeps
; extra precision and the final shift is 10 bits. Intermediate rounding uses
; pd_2176 with coefficients scaled <<6 when bit 11 of r8m is clear (10-bit
; content), or pd_640 with <<4 / <<2 scaling when it is set (12-bit content).
.hv:
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb     ; 4-tap horizontal filter id
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd     ; short vertical filter for h < 6
    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
    mov                  r6, ssq
    sub                srcq, 2
    neg                  r6          ; r6 = -ssq, to reach rows above srcq
    test          dword r8m, 0x800
    jnz .hv_12bit
    vpbroadcastd        m10, [pd_2176]
    psllw              xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd        m10, [pd_640]
    psllw              xmm0, 4
    psllw              xmm1, 2
.hv_main:
    ; Gather the first five source rows (0-4); horizontal coefficients are
    ; spilled to buf and broadcast as dword tap pairs (m8/m9 horizontal,
    ; ym12-ym14 vertical).
    movu                xm4, [srcq+r6 *2]
    vinserti32x4        ym4, [srcq+r6 *1], 1
    vinserti32x4         m4, [srcq+ssq*0], 2
    vbroadcasti32x4      m6, [spel_h_shufA]
    vinserti32x4         m4, [srcq+ssq*1], 3 ; 0 1 2 3
    lea                srcq, [srcq+ssq*2]
    movu                xm5, [srcq+ssq*0]    ; 4
    mova           [buf+ 0], xmm0
    mova           [buf+16], xmm1
    vpbroadcastd         m8, [buf+ 4]
    vpbroadcastd         m9, [buf+ 8]
    vpbroadcastd       ym12, xmm1
    vpbroadcastd       ym13, [buf+20]
    vpbroadcastd       ym14, [buf+24]
    cmp                  wd, 4
    je .hv_w4
    ; w == 2 setup: filter rows 0-4 horizontally, then interleave into the
    ; vertical sliding window (01 12 in xm1, 23 34 in xm2).
    vbroadcasti32x4      m2, [spel_h_shufA]
    mova                 m3, [spel_h_shuf2b]
    mova                 m1, m10
    pshufb               m4, m6
    pshufb              xm5, xm6
    punpcklqdq           m2, m4, m5
    vpdpwssd             m1, m8, m2    ; 04 1_ 2_ 3_
    mova                ym6, [spel_h_shuf2a]
    punpckhqdq           m4, m5
    mova                xm5, [spel_shuf2]
    vpdpwssd             m1, m9, m4
    vpermb               m1, m3, m1    ; 01 12
    vextracti32x4       xm2, ym1, 1    ; 23 34
.hv_w2_loop:
    movu                xm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym3, [srcq+ssq*0], 1
    vpermb              ym3, ym6, ym3
    pmaddwd            xmm0, xm12, xm1 ; a0 b0
    mova                xm4, xm10
    vpdpwssd            xm4, xm8, xm3
    vextracti32x4       xm3, ym3, 1
    mova                xm1, xm2       ; slide the window down two rows
    vpdpwssd           xmm0, xm13, xm2 ; a1 b1
    vpdpwssd            xm4, xm9, xm3  ; 5 6
    vpermt2b            xm2, xm5, xm4  ; 45 56
    vpdpwssd           xmm0, xm14, xm2 ; a2 b2
    psrad              xmm0, 10
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15      ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
; w == 4 2D filter: horizontally filter rows 0-4 (zmm holds four rows, xmm the
; fifth), then run the vertical 3-pair sliding window over the results.
.hv_w4:
    vbroadcasti32x4      m7, [spel_h_shufB]
    mova                ym0, [spel_shuf4a]
    pshufb               m1, m4, m6
    mova                 m2, m10       ; rounding accumulator, rows 0-3
    vpdpwssd             m2, m8, m1
    pshufb              xm1, xm5, xm6
    mova                xm3, xm10      ; rounding accumulator, row 4
    vpdpwssd            xm3, xm8, xm1
    pshufb               m4, m7
    pshufb              xm5, xm7
    vpdpwssd             m2, m9, m4    ; 0 1 2 3
    vpdpwssd            xm3, xm9, xm5  ; 4
    mova                ym5, [spel_shuf4b]
    vpermb               m1, m0, m2    ; 01 12
    vshufi32x4           m2, m3, q1032 ; 2 3 4
    vpermb               m2, m0, m2    ; 23 34
.hv_w4_loop:
    movu                xm3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym3, [srcq+ssq*0], 1
    pmaddwd             ym0, ym12, ym1 ; a0 b0
    mova                ym1, ym2       ; slide the window down two rows
    pshufb              ym4, ym3, ym6
    mova                ym2, ym10
    vpdpwssd            ym2, ym8, ym4
    pshufb              ym3, ym7
    vpdpwssd            ym0, ym13, ym1 ; a1 b1
    vpdpwssd            ym2, ym9, ym3  ; 5 6
    vpermt2b            ym2, ym5, ym1  ; 45 56
    vpdpwssd            ym0, ym14, ym2 ; a2 b2
    psrad               ym0, 10
    vextracti32x4       xm4, ym0, 1
    packusdw            xm0, xm4
    pminsw             xmm0, xm0, xm15 ; clamp to pixel max
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    RET
; w >= 8 2D filter entry: 6-tap in both directions. Same 10/12-bit rounding
; and coefficient-scaling scheme as .hv (pd_2176/<<6 vs pd_640/<<4,<<2).
; m9-m11 = horizontal tap pairs, m12-m14 = vertical tap pairs.
.hv_w8:
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd      ; short vertical filter for h < 6
    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
    mov                  r6, ssq
    sub                srcq, 4
    neg                  r6           ; r6 = -ssq
    test          dword r8m, 0x800
    jnz .hv_w8_12bit
    vpbroadcastd         m8, [pd_2176]
    psllw              xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd         m8, [pd_640]
    psllw              xmm0, 4
    psllw              xmm1, 2
.hv_w8_main:
    mova           [buf+ 0], xmm0
    mova           [buf+16], xmm1
    vpbroadcastd         m9, xmm0
    vpbroadcastd        m10, [buf+ 4]
    vpbroadcastd        m11, [buf+ 8]
    vpbroadcastd        m12, xmm1
    vpbroadcastd        m13, [buf+20]
    vpbroadcastd        m14, [buf+24]
    cmp                  wd, 16
    jge .hv_w16
    ; w == 8: two rows per zmm; horizontally filter rows 0-4, then pack the
    ; vertical sliding window (01 12 in m1, 23 34 in m2) via vpermt2b.
    mova                 m6, [spel_h_shufA]
    movu               ym16, [srcq+r6 *2]
    vinserti32x8        m16, [srcq+r6 *1], 1 ; 0 1
    movu               ym17, [srcq+ssq*0]
    vinserti32x8        m17, [srcq+ssq*1], 1 ; 2 3
    lea                srcq, [srcq+ssq*2]
    movu               ym18, [srcq+ssq*0]    ; 4
    movu                 m7, [spel_h_shufC]
    vpermb               m3, m6, m16
    mova                 m1, m8
    vpermb               m4, m6, m17
    vpdpwssd             m1, m9, m3   ; a0 b0
    mova                 m2, m8
    vpermb               m5, m6, m18
    vpdpwssd             m2, m9, m4   ; c0 d0
    mova                 m0, m8
    vpermb              m16, m7, m16
    vpdpwssd             m0, m9, m5   ; e0
    vpermb              m17, m7, m17
    vpdpwssd             m1, m11, m16 ; a2 b2
    vpermb              m18, m7, m18
    vpdpwssd             m2, m11, m17 ; c2 d2
    shufpd               m3, m16, 0x55
    vpdpwssd             m0, m11, m18 ; e2
    mova                m16, [spel_shuf8a]
    shufpd               m4, m17, 0x55
    vpdpwssd             m1, m10, m3  ; a1 b1
    shufpd               m5, m18, 0x55
    vpdpwssd             m2, m10, m4  ; c1 d1
    vpdpwssd             m0, m10, m5  ; e1
    mova                 m5, [spel_shuf8b]
    vpermt2b             m1, m16, m2  ; 01 12
    vpermt2b             m2, m16, m0  ; 23 34
.hv_w8_loop:
    movu               ym18, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0], 1
    mova                 m0, m8
    vpermb              m17, m6, m18
    vpdpwssd             m0, m9, m17  ; f0 g0
    vpermb              m18, m7, m18
    pmaddwd             m16, m12, m1  ; A0 B0
    vpdpwssd             m0, m11, m18 ; f2 g2
    shufpd              m17, m18, 0x55
    mova                 m1, m2       ; slide the window down two rows
    vpdpwssd            m16, m13, m2  ; A1 B1
    vpdpwssd             m0, m10, m17 ; f1 g1
    vpermt2b             m2, m5, m0   ; 45 56
    vpdpwssd            m16, m14, m2  ; A2 B2
    psrad               m16, 10
    vextracti32x8      ym17, m16, 1
    packusdw           ym16, ym17
    pminsw             ym16, ym15     ; clamp to pixel max
    mova         [dstq+dsq*0], xm16
    vextracti128 [dstq+dsq*1], ym16, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
; w == 16 2D filter: horizontally filter rows 0-4 (primed = unprimed comments
; mark the high/low 8-pixel halves of each row), combine halves with
; spel_shuf16, and derive the odd row pairs (01, 23, 45) from even ones via
; 16-bit funnel shifts (vpshrdd). Falls through to .hv_w32 for wider blocks.
.hv_w16:
    vbroadcasti32x4     m20, [spel_h_shufA]
    vbroadcasti32x4     m21, [spel_h_shufB]
    jg .hv_w32
    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]
    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
    movu               ym16, [srcq+r6 *1+ 0]
    movu               ym17, [srcq+r6 *1+12]
    vinserti32x8        m16, [srcq+ssq*0+ 0], 1
    vinserti32x8        m17, [srcq+ssq*0+12], 1 ; 1 2
    movu               ym18, [srcq+ssq*1+ 0]
    movu               ym19, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 3 4
    pshufb               m2, m20
    mova                 m1, m8
    pshufb               m3, m16, m20
    vpdpwssd             m1, m11, m2    ; a2
    mova                 m2, m8
    pshufb               m4, m17, m21
    vpdpwssd             m2, m9, m3     ; b0  c0
    mova                 m3, m8
    pshufb               m5, m18, m20
    vpdpwssd             m3, m11, m4    ; b2' c2'
    mova                 m4, m8
    pshufb               m7, m19, m21
    vpdpwssd             m4, m9, m5     ; d0  e0
    mova                 m5, m8
    pshufb               m0, m6, m20
    vpdpwssd             m5, m11, m7    ; d2' e2'
    mova                 m7, [spel_shuf16]
    pshufb              m16, m21
    vpdpwssd             m1, m9, m0     ; a0
    pshufb              m17, m20
    vpdpwssd             m2, m10, m16   ; b1  c1
    pshufb              m18, m21
    vpdpwssd             m3, m10, m17   ; b1' c1'
    pshufb              m19, m20
    vpdpwssd             m4, m10, m18   ; d1  e1
    pshufb               m6, m21
    vpdpwssd             m5, m10, m19   ; d1' e1'
    shufpd              m16, m17, 0x55
    vpdpwssd             m1, m10, m6    ; a1
    shufpd              m18, m19, 0x55
    vpdpwssd             m2, m11, m16   ; b2  c2
    vpdpwssd             m3, m9, m16    ; b0' c0'
    vpdpwssd             m4, m11, m18   ; d2  e2
    vpdpwssd             m5, m9, m18    ; d0' e0'
    pslldq               m1, 1
    vpermt2b             m2, m7, m3     ; 12
    vpermt2b             m4, m7, m5     ; 34
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
.hv_w16_loop:
    movu               ym18, [srcq+ssq*1+ 0]
    movu               ym19, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1
    mova                 m5, m8
    mova                 m6, m8
    pshufb              m17, m18, m20
    vpdpwssd             m5, m9, m17    ; f0  g0
    pshufb              m16, m19, m21
    vpdpwssd             m6, m11, m16   ; f2' g2'
    pmaddwd             m17, m12, m2    ; B0
    mova                 m2, m4         ; slide the vertical window
    pmaddwd             m16, m12, m1    ; A0
    mova                 m1, m3
    pshufb              m18, m21
    vpdpwssd             m5, m10, m18   ; f1  g1
    pshufb              m19, m20
    vpdpwssd             m6, m10, m19   ; f1' g1'
    vpdpwssd            m17, m13, m4    ; B1
    vpdpwssd            m16, m13, m3    ; A1
    shufpd              m18, m19, 0x55
    vpdpwssd             m5, m11, m18   ; f2  g2
    vpdpwssd             m6, m9, m18    ; f0' g0'
    mova                 m4, m7
    vpermi2b             m4, m5, m6     ; 56
    vpshrdd              m3, m2, m4, 16 ; 45
    vpdpwssd            m17, m14, m4    ; B2
    vpdpwssd            m16, m14, m3    ; A2
    psrad               m16, 10
    psrad               m17, 10
    ; Re-interleave the two accumulators into row order before packing.
    vshufi32x4          m18, m16, m17, q3232
    vinserti32x8        m16, ym17, 1
    packusdw            m16, m18
    pminsw              m16, m15        ; clamp to pixel max
    mova          [dstq+dsq*0], ym16
    vextracti32x8 [dstq+dsq*1], m16, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
; w >= 32 2D filter: 32-pixel column strips, same packed width/height counter
; trick as .v_w32 (wd = h + w*8 - 256, h restored per strip via movzx hd, wb).
; Unprimed/primed comments mark the low/high 16-pixel halves of each strip.
; Needs registers up to m27, so Win64 xmm spills are reserved here.
.hv_w32:
    WIN64_SPILL_XMM      28
    mova                m27, [spel_shuf32]
    lea                  wd, [hq+wq*8-256]
.hv_w32_loop0:
    ; Prime rows 0-4 (a-e) of this strip: load, horizontally filter both
    ; halves, and build the vertical window pairs 01/01', 23/23'.
    movu                m16, [srcq+r6 *2+ 0]
    movu                 m7, [srcq+r6 *2+12]
    movu                 m6, [srcq+r6 *1+ 0]
    movu                m18, [srcq+r6 *1+12]
    lea                  r7, [srcq+ssq*2]
    movu                m17, [srcq+ssq*0+ 0]
    movu                m19, [srcq+ssq*0+12]
    movu                m22, [srcq+ssq*1+ 0]
    movu                m24, [srcq+ssq*1+12]
    mov                  r8, dstq
    movu                m23, [r7  +ssq*0+ 0]
    movu                m25, [r7  +ssq*0+12]
    pshufb               m1, m16, m20
    mova                 m0, m8
    pshufb               m2, m7, m21
    vpdpwssd             m0, m9, m1     ; a0
    mova                 m1, m8
    pshufb               m4, m6, m20
    vpdpwssd             m1, m11, m2    ; a2'
    mova                 m2, m8
    pshufb               m3, m17, m20
    vpdpwssd             m2, m9, m4     ; b0
    mova                 m4, m8
    pshufb               m5, m18, m21
    vpdpwssd             m4, m9, m3     ; c0
    mova                 m3, m8
    pshufb              m26, m19, m21
    vpdpwssd             m3, m11, m5    ; b2'
    mova                 m5, m8
    pshufb              m16, m21
    vpdpwssd             m5, m11, m26   ; c2'
    pshufb               m7, m20
    vpdpwssd             m0, m10, m16   ; a1
    pshufb               m6, m21
    vpdpwssd             m1, m10, m7    ; a1'
    pshufb              m17, m21
    vpdpwssd             m2, m10, m6    ; b1
    pshufb              m18, m20
    vpdpwssd             m4, m10, m17   ; c1
    pshufb              m19, m20
    vpdpwssd             m3, m10, m18   ; b1'
    shufpd              m16, m7, 0x55
    vpdpwssd             m5, m10, m19   ; c1'
    shufpd               m6, m18, 0x55
    vpdpwssd             m0, m11, m16   ; a2
    shufpd              m17, m19, 0x55
    vpdpwssd             m1, m9, m16    ; a0'
    pshufb              m16, m22, m20
    vpdpwssd             m2, m11, m6    ; b2
    pshufb               m7, m23, m20
    vpdpwssd             m4, m11, m17   ; c2
    vpdpwssd             m3, m9, m6     ; b0'
    mova                 m6, m8
    vpdpwssd             m5, m9, m17    ; c0'
    pshufb              m17, m24, m21
    vpdpwssd             m6, m9, m16    ; d0
    mova                m16, m8
    pshufb              m26, m25, m21
    vpdpwssd            m16, m9, m7     ; e0
    mova                 m7, m8
    pshufb              m22, m21
    vpdpwssd             m7, m11, m17   ; d2'
    mova                m17, m8
    pshufb              m23, m21
    vpdpwssd            m17, m11, m26   ; e2'
    pshufb              m24, m20
    vpdpwssd             m6, m10, m22   ; d1
    pshufb              m25, m20
    vpdpwssd            m16, m10, m23   ; e1
    shufpd              m22, m24, 0x55
    vpdpwssd             m7, m10, m24   ; d1'
    shufpd              m23, m25, 0x55
    vpdpwssd            m17, m10, m25   ; e1'
    pslldq               m0, 1
    vpdpwssd             m6, m11, m22   ; d2
    pslldq               m1, 1
    vpdpwssd            m16, m11, m23   ; e2
    vpermt2b             m2, m27, m4    ; 12
    vpdpwssd             m7, m9, m22    ; d0'
    vpermt2b             m3, m27, m5    ; 12'
    vpdpwssd            m17, m9, m23    ; e0'
    vpshrdd              m0, m2, 16     ; 01
    vpermt2b             m6, m27, m16   ; 34
    vpshrdd              m1, m3, 16     ; 01'
    vpermt2b             m7, m27, m17   ; 34'
    vpshrdd              m4, m2, m6, 16 ; 23
    vpshrdd              m5, m3, m7, 16 ; 23'
.hv_w32_loop:
    ; Load and horizontally filter rows f/g, slide the vertical window, and
    ; accumulate the vertical taps for output rows A (even) and B (odd).
    movu                m22, [r7+ssq*1+ 0]
    movu                m24, [r7+ssq*1+12]
    lea                  r7, [r7+ssq*2]
    movu                m23, [r7+ssq*0+ 0]
    movu                m25, [r7+ssq*0+12]
    pmaddwd             m17, m12, m2    ; B0
    mova                 m2, m6
    pmaddwd             m19, m12, m3    ; B0'
    mova                 m3, m7
    pmaddwd             m16, m12, m0    ; A0
    mova                 m0, m4
    pmaddwd             m18, m12, m1    ; A0'
    mova                 m1, m5
    vpdpwssd            m17, m13, m6    ; B1
    vpdpwssd            m19, m13, m7    ; B1'
    mova                 m6, m8
    vpdpwssd            m16, m13, m4    ; A1
    pshufb               m4, m22, m20
    vpdpwssd            m18, m13, m5    ; A1'
    pshufb               m7, m23, m20
    vpdpwssd             m6, m9, m4     ; f0
    mova                 m4, m8
    pshufb               m5, m24, m21
    vpdpwssd             m4, m9, m7     ; g0
    mova                 m7, m8
    pshufb              m26, m25, m21
    vpdpwssd             m7, m11, m5    ; f2'
    mova                 m5, m8
    pshufb              m22, m21
    vpdpwssd             m5, m11, m26   ; g2'
    pshufb              m23, m21
    vpdpwssd             m6, m10, m22   ; f1
    pshufb              m24, m20
    vpdpwssd             m4, m10, m23   ; g1
    pshufb              m25, m20
    vpdpwssd             m7, m10, m24   ; f1'
    shufpd              m22, m24, 0x55
    vpdpwssd             m5, m10, m25   ; g1'
    shufpd              m23, m25, 0x55
    vpdpwssd             m6, m11, m22   ; f2
    vpdpwssd             m4, m11, m23   ; g2
    vpdpwssd             m7, m9, m22    ; f0'
    vpdpwssd             m5, m9, m23    ; g0'
    vpermt2b             m6, m27, m4    ; 56
    vpermt2b             m7, m27, m5    ; 56'
    vpdpwssd            m17, m14, m6    ; B2
    vpshrdd              m4, m2, m6, 16 ; 45
    vpdpwssd            m19, m14, m7    ; B2'
    vpshrdd              m5, m3, m7, 16 ; 45'
    vpdpwssd            m16, m14, m4    ; A2
    vpdpwssd            m18, m14, m5    ; A2'
    REPX      {psrad x, 10}, m17, m19, m16, m18
    packusdw            m17, m19
    packusdw            m16, m18
    pminsw              m17, m15        ; clamp to pixel max
    pminsw              m16, m15
    mova         [r8+dsq*0], m16
    mova         [r8+dsq*1], m17
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .hv_w32_loop
    ; Advance to the next 32-pixel column strip.
    add                srcq, 64
    add                dstq, 64
    movzx                hd, wb
    sub                  wd, 1<<8
    jg .hv_w32_loop0
    RET
2347
; Declare the put_8tap entry points for each (horizontal, vertical) filter
; combination. Entries carrying a 4th argument presumably alias into the
; shared put_8tap_16bpc body (macro definition not visible here — confirm);
; the final one precedes the actual cglobal below and needs no alias target.
PUT_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  put_8tap_16bpc
PUT_8TAP_FN regular_sharp,  REGULAR, SHARP,   put_8tap_16bpc
PUT_8TAP_FN sharp_regular,  SHARP,   REGULAR, put_8tap_16bpc
PUT_8TAP_FN sharp,          SHARP,   SHARP
2353
; 8-tap put, 16 bpc. mx/my are combined with t0d/t1d into packed filter ids
; (8tap_h, mx, 4tap_h / 8tap_v, my, 4tap_v). Pure copy (no subpel in either
; direction) is delegated to the 6-tap function's .put path.
cglobal put_8tap_16bpc, 4, 9, 0, dst, ds, src, ss, w, h, mx, my
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r8, [put_avx512icl]
    movifnidn            wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _put_6tap_16bpc_avx512icl).put
; Vertical-only 8-tap: four dword tap pairs m12-m15 broadcast from the spilled
; coefficients; srcq is rewound by 3 rows so the filter window starts above.
; m10 = pd_32 rounding for the final >> 6, m11 = clamp ceiling from r8m.
; Dispatch by log2(w) through the _8tap_v jump table.
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd      ; short filter for h < 6
    vpbroadcastd        m10, [pd_32]
    pmovsxbw           xmm0, [base+subpel_filters+myq*8]
    tzcnt               r7d, wd
    vpbroadcastw        m11, r8m
    lea                  r6, [ssq*3]
    movzx               r7d, word [r8+r7*2+table_offset(put, _8tap_v)]
    sub                srcq, r6
    mova [rsp+stack_offset+8], xmm0
    vpbroadcastd        m12, xmm0
    add                  r7, r8
    vpbroadcastd        m13, [rsp+stack_offset+12]
    vpbroadcastd        m14, [rsp+stack_offset+16]
    vpbroadcastd        m15, [rsp+stack_offset+20]
    jmp                  r7
.v_w2: ; vertical 8-tap, w == 2; two output rows per iteration (xmm only)
    ; Build interleaved row pairs 01/12, 23/34, 45/56 so each vpdpwssd
    ; multiplies one coefficient pair against two adjacent rows at once.
    movd               xmm2, [srcq+ssq*0]
    pinsrd             xmm2, [srcq+ssq*1], 1
    pinsrd             xmm2, [srcq+ssq*2], 2
    add                srcq, r6
    pinsrd             xmm2, [srcq+ssq*0], 3  ; 0 1 2 3
    movd               xmm3, [srcq+ssq*1]
    vpbroadcastd       xmm1, [srcq+ssq*2]
    add                srcq, r6
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm3, xmm1, 0x02       ; 4 5
    vpblendd           xmm1, xmm0, 0x02       ; 5 6
    palignr            xmm4, xmm3, xmm2, 4    ; 1 2 3 4
    punpcklwd          xmm3, xmm1             ; 45 56
    punpcklwd          xmm1, xmm2, xmm4       ; 01 12
    punpckhwd          xmm2, xmm4             ; 23 34
.v_w2_loop:
    vpbroadcastd       xmm4, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova               xmm5, xm10             ; accumulator seeded with rounding
    vpdpwssd           xmm5, xm12, xmm1       ; a0 b0
    mova               xmm1, xmm2             ; rotate the row-pair history
    vpdpwssd           xmm5, xm13, xmm2       ; a1 b1
    mova               xmm2, xmm3
    vpdpwssd           xmm5, xm14, xmm3       ; a2 b2
    vpblendd           xmm3, xmm0, xmm4, 0x02 ; 6 7
    vpbroadcastd       xmm0, [srcq+ssq*0]
    vpblendd           xmm4, xmm0, 0x02       ; 7 8
    punpcklwd          xmm3, xmm4             ; 67 78
    vpdpwssd           xmm5, xm15, xmm3       ; a3 b3
    psrad              xmm5, 6
    packusdw           xmm5, xmm5
    pminsw             xmm5, xm11             ; clamp to pixel max
    movd       [dstq+dsq*0], xmm5
    pextrd     [dstq+dsq*1], xmm5, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w2_loop
    RET
.v_w4: ; vertical 8-tap, w == 4; two output rows per iteration (ymm)
    ; Same row-pair scheme as .v_w2, but each 128-bit lane holds one row.
    movq               xmm1, [srcq+ssq*0]
    vpbroadcastq       ymm0, [srcq+ssq*1]
    vpbroadcastq       ymm2, [srcq+ssq*2]
    add                srcq, r6
    vpbroadcastq       ymm4, [srcq+ssq*0]
    vpbroadcastq       ymm3, [srcq+ssq*1]
    vpbroadcastq       ymm5, [srcq+ssq*2]
    add                srcq, r6
    vpblendd           ymm1, ymm0, 0x30
    vpblendd           ymm0, ymm2, 0x30
    punpcklwd          ymm1, ymm0       ; 01 12
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm2, ymm4, 0x30
    vpblendd           ymm4, ymm3, 0x30
    punpcklwd          ymm2, ymm4       ; 23 34
    vpblendd           ymm3, ymm5, 0x30
    vpblendd           ymm5, ymm0, 0x30
    punpcklwd          ymm3, ymm5       ; 45 56
.v_w4_loop:
    vpbroadcastq       ymm5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    mova               ymm4, ym10       ; accumulator seeded with rounding
    vpdpwssd           ymm4, ym12, ymm1 ; a0 b0
    mova               ymm1, ymm2       ; rotate row-pair history
    vpdpwssd           ymm4, ym13, ymm2 ; a1 b1
    mova               ymm2, ymm3
    vpdpwssd           ymm4, ym14, ymm3 ; a2 b2
    vpblendd           ymm3, ymm0, ymm5, 0x30
    vpbroadcastq       ymm0, [srcq+ssq*0]
    vpblendd           ymm5, ymm0, 0x30
    punpcklwd          ymm3, ymm5       ; 67 78
    vpdpwssd           ymm4, ym15, ymm3 ; a3 b3
    psrad              ymm4, 6
    vextracti128       xmm5, ymm4, 1
    packusdw           xmm4, xmm5
    pminsw             xmm4, xm11       ; clamp to pixel max
    movq       [dstq+dsq*0], xmm4
    movhps     [dstq+dsq*1], xmm4
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w4_loop
    vzeroupper                          ; legacy-ymm use above requires this
    RET
.v_w8: ; vertical 8-tap, w == 8; rows packed 3-4 per zmm, paired via vpermb
    vbroadcasti32x4      m2, [srcq+ssq*2]
    vinserti32x4         m1, m2, [srcq+ssq*0], 0
    vinserti32x4         m1, [srcq+ssq*1], 1 ; 0 1 2
    add                srcq, r6
    vinserti32x4        ym2, [srcq+ssq*0], 1
    vinserti32x4         m2, [srcq+ssq*1], 2 ; 2 3 4
    mova                 m6, [spel_v_shuf8]  ; interleaves adjacent rows
    movu                xm0, [srcq+ssq*1]
    vinserti32x4        ym0, [srcq+ssq*2], 1
    add                srcq, r6
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
    vpermb               m1, m6, m1          ; 01 12
    vpermb               m2, m6, m2          ; 23 34
    vpermb               m3, m6, m0          ; 45 56
.v_w8_loop:
    vinserti32x4         m0, [srcq+ssq*1], 3
    lea                srcq, [srcq+ssq*2]
    movu                xm5, [srcq+ssq*0]
    mova                 m4, m10             ; accumulator seeded with rounding
    vpdpwssd             m4, m12, m1         ; a0 b0
    mova                 m1, m2
    vshufi32x4           m0, m5, q1032       ; 6 7 8
    vpdpwssd             m4, m13, m2         ; a1 b1
    mova                 m2, m3
    vpdpwssd             m4, m14, m3         ; a2 b2
    vpermb               m3, m6, m0          ; 67 78
    vpdpwssd             m4, m15, m3         ; a3 b3
    psrad                m4, 6
    vextracti32x8       ym5, m4, 1
    packusdw            ym4, ym5
    pminsw              ym4, ym11            ; clamp to pixel max
    mova          [dstq+dsq*0], xm4
    vextracti32x4 [dstq+dsq*1], ym4, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w8_loop
    RET
.v_w16: ; vertical 8-tap, w == 16; two rows per zmm, two accumulators (a/b)
    vbroadcasti32x8      m0, [srcq+ssq*1]
    vinserti32x8         m1, m0, [srcq+ssq*2], 1
    vinserti32x8         m0, [srcq+ssq*0], 0
    mova                 m8, [spel_v_shuf16]
    add                srcq, r6
    movu                ym3, [srcq+ssq*0]
    vinserti32x8         m3, [srcq+ssq*1], 1
    movu                ym5, [srcq+ssq*2]
    add                srcq, r6
    vinserti32x8         m5, [srcq+ssq*0], 1
    vpermb               m1, m8, m1     ; 12
    vpermb               m0, m8, m0     ; 01
    vpermb               m3, m8, m3     ; 34
    vpermb               m5, m8, m5     ; 56
    mova                 m9, [deint_q_shuf]
    ; vpshrdd derives the odd row pairs from the even ones for free
    vpshrdd              m2, m1, m3, 16 ; 23
    vpshrdd              m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova                 m7, m10        ; both accumulators seeded w/ rounding
    vpdpwssd             m7, m12, m1    ; b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; a0
    mova                 m1, m3         ; rotate row-pair history
    vpdpwssd             m7, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m6, m13, m2    ; a1
    mova                 m3, m5
    vpdpwssd             m7, m14, m5    ; b2
    mova                 m2, m4
    vpdpwssd             m6, m14, m4    ; a2
    movu                ym5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m5, [srcq+ssq*0], 1
    vpermb               m5, m8, m5     ; 78
    vpshrdd              m4, m3, m5, 16 ; 67
    vpdpwssd             m7, m15, m5    ; b3
    vpdpwssd             m6, m15, m4    ; a3
    psrad                m7, 6
    psrad                m6, 6
    packusdw             m6, m7
    pminsw               m6, m11        ; clamp to pixel max
    vpermq               m6, m9, m6     ; undo pack interleave
    mova          [dstq+dsq*0], ym6
    vextracti32x8 [dstq+dsq*1], m6, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .v_w16_loop
    RET
.v_w32: ; vertical 8-tap, w >= 32: process in 32-pixel (64-byte) columns,
.v_w64: ; low/high word halves filtered separately (l/h accumulators)
.v_w128:
    WIN64_SPILL_XMM      23
    ; Pack loop state into wd: low byte = h, upper bits = column counter.
    lea                  wd, [hq+wq*8-256]
.v_w32_loop0:
    movu                m16, [srcq+ssq*0]
    movu                m17, [srcq+ssq*1]
    lea                  r7, [srcq+r6   ]
    movu                m18, [srcq+ssq*2]
    movu                m19, [r7  +ssq*0]
    mov                  r8, dstq
    movu                m20, [r7  +ssq*1]
    movu                m21, [r7  +ssq*2]
    add                  r7, r6
    movu                m22, [r7  +ssq*0]
    punpcklwd            m0, m16, m17 ; 01l
    punpckhwd           m16, m17      ; 01h
    punpcklwd            m1, m17, m18 ; 12l
    punpckhwd           m17, m18      ; 12h
    punpcklwd            m2, m18, m19 ; 23l
    punpckhwd           m18, m19      ; 23h
    punpcklwd            m3, m19, m20 ; 34l
    punpckhwd           m19, m20      ; 34h
    punpcklwd            m4, m20, m21 ; 45l
    punpckhwd           m20, m21      ; 45h
    punpcklwd            m5, m21, m22 ; 56l
    punpckhwd           m21, m22      ; 56h
.v_w32_loop:
    mova                 m6, m10      ; 4 accumulators: a/b rows x l/h halves
    vpdpwssd             m6, m12, m0  ; a0l
    mova                 m8, m10
    vpdpwssd             m8, m12, m16 ; a0h
    mova                 m7, m10
    vpdpwssd             m7, m12, m1  ; b0l
    mova                 m9, m10
    vpdpwssd             m9, m12, m17 ; b0h
    mova                 m0, m2
    vpdpwssd             m6, m13, m2  ; a1l
    mova                m16, m18
    vpdpwssd             m8, m13, m18 ; a1h
    mova                 m1, m3
    vpdpwssd             m7, m13, m3  ; b1l
    mova                m17, m19
    vpdpwssd             m9, m13, m19 ; b1h
    mova                 m2, m4
    vpdpwssd             m6, m14, m4  ; a2l
    mova                m18, m20
    vpdpwssd             m8, m14, m20 ; a2h
    mova                 m3, m5
    vpdpwssd             m7, m14, m5  ; b2l
    mova                m19, m21
    vpdpwssd             m9, m14, m21 ; b2h
    movu                m21, [r7+ssq*1]
    lea                  r7, [r7+ssq*2]
    punpcklwd            m4, m22, m21 ; 67l
    punpckhwd           m20, m22, m21 ; 67h
    movu                m22, [r7+ssq*0]
    vpdpwssd             m6, m15, m4  ; a3l
    vpdpwssd             m8, m15, m20 ; a3h
    punpcklwd            m5, m21, m22 ; 78l
    punpckhwd           m21, m22      ; 78h
    vpdpwssd             m7, m15, m5  ; b3l
    vpdpwssd             m9, m15, m21 ; b3h
    REPX       {psrad x, 6}, m6, m8, m7, m9
    packusdw             m6, m8
    packusdw             m7, m9
    pminsw               m6, m11      ; clamp to pixel max
    pminsw               m7, m11
    mova         [r8+dsq*0], m6
    mova         [r8+dsq*1], m7
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .v_w32_loop
    add                srcq, 64       ; next 32-pixel column
    add                dstq, 64
    movzx                hd, wb       ; reload h from low byte of wd
    sub                  wd, 1<<8     ; decrement column counter
    jg .v_w32_loop0
    RET
.h_w2: ; horizontal 4-tap, w == 2 (entered from .h_w4 when w < 4);
       ; xmm0 holds the coefficients loaded by .h_w4
    RESET_STACK_STATE
    mova                ym2, [spel_h_shuf2a]
    sub                srcq, 2                ; left extent of the 4-tap filter
    pshufd             xmm3, xmm0, q1111      ; coef pair 2/3
    pshufd             xmm4, xmm0, q2222      ; coef pair 4/5
.h_w2_loop:
    movu                xm1, [srcq+ssq*0]
    vinserti32x4        ym1, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova               xmm0, xm8              ; accumulator seeded w/ rounding
    vpermb              ym1, ym2, ym1
    vpdpwssd           xmm0, xmm3, xm1
    vextracti32x4       xm1, ym1, 1
    vpdpwssd           xmm0, xmm4, xm1
    psrad              xmm0, 6
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15             ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w2_loop
    RET
.h_w4: ; horizontal 4-tap, w <= 4 (entered via 'jle .h_w4' in .h)
    movzx               mxd, mxb              ; low byte = 4-tap filter index
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    ; Flags are still those of 'cmp wd, 4' in .h: movzx and pmovsxbw do not
    ; modify EFLAGS, so jl here means w < 4, i.e. w == 2.
    jl .h_w2
    vbroadcasti32x4     ym4, [spel_h_shufA]
    vbroadcasti32x4     ym5, [spel_h_shufB]
    sub                srcq, 2                ; left extent of the 4-tap filter
    pshufd             xmm0, xmm0, q2211
    vpbroadcastq        ym6, xmm0             ; coef pairs 2/3
    vpermq              ym7, ymm0, q1111      ; coef pairs 4/5
.h_w4_loop:
    movu                xm2, [srcq+ssq*0]
    vinserti32x4        ym2, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                ym0, ym8              ; accumulator seeded w/ rounding
    pshufb              ym1, ym2, ym4
    vpdpwssd            ym0, ym6, ym1
    pshufb              ym2, ym5
    vpdpwssd            ym0, ym7, ym2
    psrad               ym0, 6
    vextracti32x4       xm1, ym0, 1
    packusdw            xm0, xm1
    pminsw             xmm0, xm0, xm15        ; clamp to pixel max
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w4_loop
    RET
.h_w8: ; horizontal 8-tap, w == 8 (entered via 'jl .h_w8' after sub wd, 16);
       ; coefficients m10-m13 were broadcast in .h
    mova                 m4, [spel_h_shufA]
    movu                 m5, [spel_h_shufB]
    movu                 m6, [spel_h_shufC]
    mova                 m7, [spel_h_shufD]
.h_w8_loop:
    movu                ym2, [srcq+ssq*0]
    vinserti32x8         m2, [srcq+ssq*1], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m8               ; accumulator seeded w/ rounding
    vpermb               m1, m4, m2
    vpdpwssd             m0, m10, m1
    vpermb               m1, m5, m2
    vpdpwssd             m0, m11, m1
    vpermb               m1, m6, m2
    vpdpwssd             m0, m12, m1
    vpermb               m1, m7, m2
    vpdpwssd             m0, m13, m1
    psrad                m0, 6
    vextracti32x8       ym1, m0, 1
    packusdw            ym0, ym1
    pminsw              ym0, ym15             ; clamp to pixel max
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w8_loop
    RET
.h: ; horizontal filtering requested; branch to .hv if vertical too
    vpbroadcastw        m15, r8m              ; pixel max for final clamp
    test                myd, 0xf00
    jnz .hv
    mov                 r7d, r8m
    shr                 r7d, 11               ; bitdepth-dependent rounding idx
    vpbroadcastd         m8, [base+put_8tap_h_rnd+r7*4]
    cmp                  wd, 4
    jle .h_w4                                 ; w <= 4 loads its own 4-tap coefs
    shr                 mxd, 16               ; high bits = 8-tap filter index
    sub                srcq, 6                ; left extent of the 8-tap filter
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mova              [buf], xmm0
    vpbroadcastd        m10, xmm0             ; m10-m13 = coef pairs 01/23/45/67
    vpbroadcastd        m11, [buf+ 4]
    vpbroadcastd        m12, [buf+ 8]
    vpbroadcastd        m13, [buf+12]
    sub                  wd, 16
    jl .h_w8                                  ; w == 8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    jg .h_w32                                 ; w > 16; else fall into w == 16
.h_w16_loop: ; horizontal 8-tap, w == 16; two rows (a/b) per iteration
    movu                ym2, [srcq+ssq*0+ 0]
    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
    movu                ym3, [srcq+ssq*0+16]
    vinserti32x8         m3, [srcq+ssq*1+16], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m8      ; accumulators seeded with rounding
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m12, m4 ; b2
    pshufb               m4, m2, m7
    vpdpwssd             m0, m11, m4 ; a1
    pshufb               m4, m3, m7
    vpdpwssd             m1, m13, m4 ; b3
    ; middle 8 pixels are shared between the two output halves
    shufpd               m2, m3, 0x55
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a2
    vpdpwssd             m1, m10, m4 ; b0
    pshufb               m2, m7
    vpdpwssd             m0, m13, m2 ; a3
    vpdpwssd             m1, m11, m2 ; b1
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .h_w16_loop
    RET
.h_w32: ; horizontal 8-tap, w > 16: one row per pass, 32 pixels per step,
        ; iterated with a negative column index r6 counting up to 0
    lea                srcq, [srcq+wq*2]
    lea                dstq, [dstq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+ 8]
    mova                 m0, m8      ; accumulators seeded with rounding
    mova                 m1, m8
    pshufb               m4, m2, m6
    vpdpwssd             m0, m10, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m10, m4 ; b0
    vpdpwssd             m0, m12, m4 ; a2 (m3 overlaps m2 by 8 pixels)
    movu                 m4, [srcq+r6*2+16]
    pshufb               m3, m7
    vpdpwssd             m1, m11, m3 ; b1
    vpdpwssd             m0, m13, m3 ; a3
    pshufb               m3, m4, m6
    vpdpwssd             m1, m12, m3 ; b2
    pshufb               m2, m7
    vpdpwssd             m0, m11, m2 ; a1
    pshufb               m4, m7
    vpdpwssd             m1, m13, m4 ; b3
    psrad                m0, 6
    psrad                m1, 6
    packusdw             m0, m1
    pminsw               m0, m15     ; clamp to pixel max
    mova        [dstq+r6*2], m0
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, ssq
    add                dstq, dsq
    dec                  hd
    jg .h_w32_loop0
    RET
.hv: ; combined horizontal+vertical filtering; this path handles w <= 4
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb               ; 4-tap horizontal index
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd               ; h < 6: 4-tap vertical index
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [ssq*3]
    sub                srcq, 2                 ; horizontal left extent
    sub                srcq, r6                ; vertical top extent
    test          dword r8m, 0x800             ; 12-bit content?
    jnz .hv_12bit
    vpbroadcastd        m10, [pd_2176]         ; 10-bit intermediate rounding
    psllw              xmm0, 6
    jmp .hv_main
.hv_12bit:
    vpbroadcastd        m10, [pd_640]          ; 12-bit intermediate rounding
    psllw              xmm0, 4                 ; rescale coefs so both bitdepth
    psllw              xmm1, 2                 ; paths share the final >> 10
.hv_main:
    mova           [buf+ 0], xmm0
    mova           [buf+16], xmm1
    vpbroadcastd         m8, [buf+ 4]          ; h coef pairs 2/3, 4/5
    vpbroadcastd         m9, [buf+ 8]
    vpbroadcastd       ym11, xmm1              ; v coef pairs 01/23/45/67
    vpbroadcastd       ym12, [buf+20]
    vpbroadcastd       ym13, [buf+24]
    vpbroadcastd       ym14, [buf+28]
    movu                xm4, [srcq+ssq*0]
    vinserti32x4        ym4, [srcq+ssq*1], 1
    vinserti32x4         m4, [srcq+ssq*2], 2
    add                srcq, r6
    vinserti32x4         m4, [srcq+ssq*0], 3 ; 0 1 2 3
    movu                xm0, [srcq+ssq*1]
    vinserti32x4        ym0, [srcq+ssq*2], 1
    add                srcq, r6
    vinserti32x4         m0, [srcq+ssq*0], 2 ; 4 5 6
    cmp                  wd, 4
    je .hv_w4
    ; w == 2: horizontally filter the 7 setup rows, then run the vertical
    ; 8-tap over the intermediate values two output rows at a time
    vbroadcasti32x4      m2, [spel_h_shufA]
    mova                 m3, [spel_h_shuf2b]
    mova                ym6, [spel_h_shuf2a]
    mova                xm7, [spel_shuf2]
    mova                 m1, m10
    pshufb               m4, m2
    pshufb               m0, m2
    punpcklqdq           m2, m4, m0
    vpdpwssd             m1, m8, m2    ; 04 15 26 3_
    punpckhqdq           m4, m0
    vpdpwssd             m1, m9, m4
    vpermb               m1, m3, m1    ; 01 12
    vextracti32x4       xm2, ym1, 1    ; 23 34
    vextracti32x4       xm3, m1, 2     ; 45 56
.hv_w2_loop:
    movu                xm5, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x4        ym5, [srcq+ssq*0], 1
    mova                xm4, xm10
    vpermb              ym5, ym6, ym5
    pmaddwd            xmm0, xm11, xm1 ; a0 b0
    vpdpwssd            xm4, xm8, xm5
    vextracti32x4       xm5, ym5, 1
    mova                xm1, xm2       ; rotate row-pair history
    vpdpwssd           xmm0, xm12, xm2 ; a1 b1
    vpdpwssd            xm4, xm9, xm5  ; 7 8
    mova                xm2, xm3
    vpdpwssd           xmm0, xm13, xm3 ; a2 b2
    vpermt2b            xm3, xm7, xm4  ; 67 78
    vpdpwssd           xmm0, xm14, xm3 ; a3 b3
    psrad              xmm0, 10
    packusdw           xmm0, xmm0
    pminsw             xmm0, xm15      ; clamp to pixel max
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w2_loop
    RET
.hv_w4: ; h+v filtering, w == 4; m4 = rows 0-3, m0 = rows 4-6 (from .hv)
    vbroadcasti32x4     m19, [spel_h_shufA]
    vbroadcasti32x4     m20, [spel_h_shufB]
    mova                ym6, [spel_shuf4a]
    mova                ym7, [spel_shuf4b]
    ; horizontally filter the 7 setup rows into interleaved row pairs
    mova                 m2, m10
    mova                 m3, m10
    pshufb               m1, m4, m19
    vpdpwssd             m2, m8, m1
    pshufb               m1, m0, m19
    vpdpwssd             m3, m8, m1
    pshufb               m4, m20
    vpdpwssd             m2, m9, m4
    pshufb               m0, m20
    vpdpwssd             m3, m9, m0
    vpermb               m1, m6, m2    ; 01 12
    vshufi32x4           m2, m3, q1032
    vpermb               m3, m6, m3    ; 45 56
    vpermb               m2, m6, m2    ; 23 34
.hv_w4_loop:
    movu               xm18, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti128        ym18, [srcq+ssq*0], 1
    pmaddwd            ym16, ym11, ym1 ; a0 b0
    mova                ym1, ym2       ; rotate row-pair history
    mova                ym2, ym3
    pshufb             ym17, ym18, ym19
    mova                ym3, ym10
    vpdpwssd            ym3, ym8, ym17
    pshufb             ym18, ym20
    vpdpwssd           ym16, ym12, ym1 ; a1 b1
    vpdpwssd            ym3, ym9, ym18 ; 7 8
    vpdpwssd           ym16, ym13, ym2 ; a2 b2
    vpermt2b            ym3, ym7, ym2  ; 67 78
    vpdpwssd           ym16, ym14, ym3 ; a3 b3
    psrad              ym16, 10
    vextracti128       xm17, ym16, 1
    packusdw           xm16, xm17
    pminsw             xm16, xm15      ; clamp to pixel max
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w4_loop
    vzeroupper
    RET
.hv_w8: ; h+v filtering, w >= 8; full 8-tap in both directions
    shr                 mxd, 16               ; 8-tap horizontal index
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd              ; h < 6: 4-tap vertical index
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [ssq*3]
    sub                srcq, 6                ; horizontal left extent
    sub                srcq, r6               ; vertical top extent
    test          dword r8m, 0x800            ; 12-bit content?
    jnz .hv_w8_12bit
    vpbroadcastd        m10, [pd_2176]        ; 10-bit intermediate rounding
    psllw              xmm0, 6
    jmp .hv_w8_main
.hv_w8_12bit:
    vpbroadcastd        m10, [pd_640]         ; 12-bit intermediate rounding
    psllw              xmm0, 4                ; rescale coefs so both bitdepth
    psllw              xmm1, 2                ; paths share the final >> 10
.hv_w8_main:
    mova           [buf+ 0], xmm0
    mova           [buf+16], xmm1
    vpbroadcastd        m11, xmm0             ; h coef pairs 01/23/45/67
    vpbroadcastd        m12, [buf+ 4]
    vpbroadcastd        m13, [buf+ 8]
    vpbroadcastd        m14, [buf+12]
    vpbroadcastd        m16, xmm1             ; v coef pairs 01/23/45/67
    vpbroadcastd        m17, [buf+20]
    vpbroadcastd        m18, [buf+24]
    vpbroadcastd        m19, [buf+28]
    cmp                  wd, 8
    jg .hv_w16
    ; w == 8: horizontally filter the 7 setup rows (a-g), two per zmm
    mova                 m5, [spel_h_shufA]
    movu                ym0, [srcq+ssq*0]
    vinserti32x8         m0, [srcq+ssq*1], 1 ; 0 1
    movu                ym9, [srcq+ssq*2]
    add                srcq, r6
    vinserti32x8         m9, [srcq+ssq*0], 1 ; 2 3
    movu               ym20, [srcq+ssq*1]
    vinserti32x8        m20, [srcq+ssq*2], 1 ; 4 5
    add srcq, r6
    movu               ym21, [srcq+ssq*0]    ; 6
    movu                 m6, [spel_h_shufB]
    movu                 m7, [spel_h_shufC]
    vpermb               m8, m5, m0
    mova                 m1, m10
    vpdpwssd             m1, m11, m8  ; a0 b0
    vpermb               m8, m5, m9
    mova                 m2, m10
    vpdpwssd             m2, m11, m8  ; c0 d0
    vpermb               m8, m5, m20
    mova                 m3, m10
    vpdpwssd             m3, m11, m8  ; e0 f0
    vpermb               m8, m5, m21
    mova                 m4, m10
    vpdpwssd             m4, m11, m8  ; g0
    vpermb               m8, m6, m0
    vpdpwssd             m1, m12, m8  ; a1 b1
    vpermb               m8, m6, m9
    vpdpwssd             m2, m12, m8  ; c1 d1
    vpermb               m8, m6, m20
    vpdpwssd             m3, m12, m8  ; e1 f1
    vpermb               m8, m6, m21
    vpdpwssd             m4, m12, m8  ; g1
    vpermb               m8, m7, m0
    vpdpwssd             m1, m13, m8  ; a2 b2
    vpermb               m8, m7, m9
    vpdpwssd             m2, m13, m8  ; c2 d2
    vpermb               m8, m7, m20
    vpdpwssd             m3, m13, m8  ; e2 f2
    vpermb               m8, m7, m21
    vpdpwssd             m4, m13, m8  ; g2
    mova                 m8, [spel_h_shufD]
    vpermb               m0, m8, m0
    vpdpwssd             m1, m14, m0  ; a3 b3
    mova                 m0, [spel_shuf8a]
    vpermb               m9, m8, m9
    vpdpwssd             m2, m14, m9  ; c3 d3
    mova                 m9, [spel_shuf8b]
    vpermb              m20, m8, m20
    vpdpwssd             m3, m14, m20 ; e3 f3
    vpermb              m21, m8, m21
    vpdpwssd             m4, m14, m21 ; g3
    vpermt2b             m1, m0, m2   ; 01 12
    vpermt2b             m2, m0, m3   ; 23 34
    vpermt2b             m3, m0, m4   ; 45 56
.hv_w8_loop:
    ; per iteration: h-filter 2 new rows (h i), v-filter 2 output rows (A B)
    movu                ym0, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m0, [srcq+ssq*0], 1
    mova                 m4, m10
    vpermb              m21, m5, m0
    vpdpwssd             m4, m11, m21 ; h0 i0
    vpermb              m21, m6, m0
    pmaddwd             m20, m16, m1  ; A0 B0
    vpdpwssd             m4, m12, m21 ; h1 i1
    vpermb              m21, m7, m0
    mova                 m1, m2       ; rotate row-pair history
    vpdpwssd            m20, m17, m2  ; A1 B1
    vpdpwssd             m4, m13, m21 ; h2 i2
    vpermb              m21, m8, m0
    mova                 m2, m3
    vpdpwssd            m20, m18, m3  ; A2 B2
    vpdpwssd             m4, m14, m21 ; h3 i3
    vpermt2b             m3, m9, m4   ; 67 78
    vpdpwssd            m20, m19, m3  ; A3 B3
    psrad               m20, 10
    vextracti32x8      ym21, m20, 1
    packusdw           ym20, ym21
    pminsw             ym20, ym15     ; clamp to pixel max
    mova         [dstq+dsq*0], xm20
    vextracti128 [dstq+dsq*1], ym20, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .hv_w8_loop
    vzeroupper
    RET
.hv_w16: ; h+v filtering, w >= 16: processed in 16-pixel columns
    WIN64_SPILL_XMM 26
    vbroadcasti32x4     m20, [spel_h_shufA]
    vbroadcasti32x4     m21, [spel_h_shufB]
    add                  wd, wd         ; count columns in 16-pixel units
    mova                 m9, [spel_shuf16]
    ; wd packing: low byte = h, upper bits = column counter (as in .v_w32)
    lea                  wd, [hq+wq*8-256]
.hv_w16_loop0:
    ; horizontally filter the 7 setup rows (a-g); each row needs 16+7 input
    ; pixels, loaded as overlapping 0/+8/+16 slices
    vbroadcasti32x8      m5, [srcq+ssq*0+ 8]
    vinserti32x8         m4, m5, [srcq+ssq*0+ 0], 0
    vinserti32x8         m5, [srcq+ssq*0+16], 1 ; 0
    movu                ym6, [srcq+ssq*1+ 0]
    movu                ym7, [srcq+ssq*1+16]
    lea                  r7, [srcq+r6]
    vinserti32x8         m6, [srcq+ssq*2+ 0], 1
    vinserti32x8         m7, [srcq+ssq*2+16], 1 ; 1 2
    movu               ym22, [r7  +ssq*0+ 0]
    movu               ym23, [r7  +ssq*0+16]
    mov                  r8, dstq
    vinserti32x8        m22, [r7  +ssq*1+ 0], 1
    vinserti32x8        m23, [r7  +ssq*1+16], 1 ; 3 4
    movu               ym24, [r7  +ssq*2+ 0]
    movu               ym25, [r7  +ssq*2+16]
    add                  r7, r6
    vinserti32x8        m24, [r7  +ssq*0+ 0], 1
    vinserti32x8        m25, [r7  +ssq*0+16], 1 ; 5 6
    pshufb               m0, m4, m20
    mova                 m1, m10
    vpdpwssd             m1, m11, m0    ; a0
    pshufb               m0, m6, m20
    mova                 m2, m10
    vpdpwssd             m2, m11, m0    ; b0
    pshufb               m0, m7, m20
    mova                 m3, m10
    vpdpwssd             m3, m13, m0    ; c2
    pshufb               m0, m4, m21
    vpdpwssd             m1, m12, m0    ; a1
    pshufb               m0, m6, m21
    vpdpwssd             m2, m12, m0    ; b1
    pshufb               m0, m7, m21
    vpdpwssd             m3, m14, m0    ; c3
    pshufb               m0, m5, m20
    vpdpwssd             m1, m13, m0    ; a2
    shufpd               m6, m7, 0x55   ; shared middle pixels of rows 1/2
    pshufb               m7, m6, m20
    vpdpwssd             m2, m13, m7    ; b2
    vpdpwssd             m3, m11, m7    ; c0
    pshufb               m5, m21
    vpdpwssd             m1, m14, m5    ; a3
    pshufb               m6, m21
    vpdpwssd             m2, m14, m6    ; b3
    vpdpwssd             m3, m12, m6    ; c1
    pshufb               m0, m22, m20
    mova                 m4, m10
    vpdpwssd             m4, m11, m0    ; d0
    pshufb               m0, m23, m20
    mova                 m5, m10
    vpdpwssd             m5, m13, m0    ; e2
    pshufb               m0, m24, m20
    mova                 m6, m10
    vpdpwssd             m6, m11, m0    ; f0
    pshufb               m0, m25, m20
    mova                 m7, m10
    vpdpwssd             m7, m13, m0    ; g2
    pshufb               m0, m22, m21
    vpdpwssd             m4, m12, m0    ; d1
    pshufb               m0, m23, m21
    vpdpwssd             m5, m14, m0    ; e3
    pshufb               m0, m24, m21
    vpdpwssd             m6, m12, m0    ; f1
    pshufb               m0, m25, m21
    vpdpwssd             m7, m14, m0    ; g3
    shufpd              m22, m23, 0x55
    pshufb              m23, m22, m20
    vpdpwssd             m4, m13, m23   ; d2
    vpdpwssd             m5, m11, m23   ; e0
    shufpd              m24, m25, 0x55
    pshufb              m25, m24, m20
    vpdpwssd             m6, m13, m25   ; f2
    vpdpwssd             m7, m11, m25   ; g0
    pshufb              m22, m21
    vpdpwssd             m4, m14, m22   ; d3
    vpdpwssd             m5, m12, m22   ; e1
    pshufb              m24, m21
    vpdpwssd             m6, m14, m24   ; f3
    vpdpwssd             m7, m12, m24   ; g1
    pslldq               m1, 1
    vpermt2b             m2, m9, m3     ; 12
    vpermt2b             m4, m9, m5     ; 34
    vpermt2b             m6, m9, m7     ; 56
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
    vpshrdd              m5, m4, m6, 16 ; 45
.hv_w16_loop:
    ; per iteration: h-filter 2 new rows (h i), v-filter 2 output rows (A B)
    movu               ym24, [r7+ssq*1+ 0]
    movu               ym25, [r7+ssq*1+16]
    lea                  r7, [r7+ssq*2]
    vinserti32x8        m24, [r7+ssq*0+ 0], 1
    vinserti32x8        m25, [r7+ssq*0+16], 1
    mova                 m7, m10
    mova                 m8, m10
    pshufb               m0, m24, m20
    vpdpwssd             m7, m11, m0    ; h0
    pshufb               m0, m25, m20
    vpdpwssd             m8, m13, m0    ; i2
    pmaddwd             m22, m16, m1    ; A0
    mova                 m1, m3         ; rotate row-pair history
    pmaddwd             m23, m16, m2    ; B0
    mova                 m2, m4
    pshufb               m0, m24, m21
    vpdpwssd             m7, m12, m0    ; h1
    pshufb               m0, m25, m21
    vpdpwssd             m8, m14, m0    ; i3
    vpdpwssd            m22, m17, m3    ; A1
    mova                 m3, m5
    vpdpwssd            m23, m17, m4    ; B1
    mova                 m4, m6
    shufpd              m24, m25, 0x55
    pshufb              m25, m24, m20
    vpdpwssd             m7, m13, m25   ; h2
    vpdpwssd             m8, m11, m25   ; i0
    vpdpwssd            m22, m18, m5    ; A2
    vpdpwssd            m23, m18, m6    ; B2
    pshufb              m24, m21
    vpdpwssd             m7, m14, m24   ; h3
    vpdpwssd             m8, m12, m24   ; i1
    vpermt2b             m7, m9, m8     ; 78
    vpshrdd              m5, m6, m7, 16 ; 67
    vpdpwssd            m22, m19, m5    ; A3
    vpdpwssd            m23, m19, m7    ; B3
    mova                 m6, m7
    psrad               m22, 10
    psrad               m23, 10
    vshufi32x4           m0, m22, m23, q3232
    vinserti32x8        m22, ym23, 1
    packusdw            m22, m0
    pminsw              m22, m15        ; clamp to pixel max
    mova          [r8+dsq*0], ym22
    vextracti32x8 [r8+dsq*1], m22, 1
    lea                  r8, [r8+dsq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                srcq, 32         ; next 16-pixel column
    add                dstq, 32
    movzx                hd, wb         ; reload h from low byte of wd
    sub                  wd, 1<<8       ; decrement column counter
    jg .hv_w16_loop0
    RET
3202
; Temporary registers receiving the filter-type table offsets consumed by
; the prep entry code below (t0 = horizontal, t1 = vertical). Win64 uses a
; different second temporary because its argument registers differ.
%if WIN64
DECLARE_REG_TMP 6, 4
%else
DECLARE_REG_TMP 6, 7
%endif

; Entry-point aliases for the REGULAR/SMOOTH filter-type combinations.
; None of these ever selects the SHARP (8-tap) kernel, so they are serviced
; by prep_6tap_16bpc (4th argument, presumably resolved by the FN macro from
; x86inc — TODO confirm); the plain regular/regular alias falls through into
; the function defined below.
%define PREP_8TAP_FN FN prep_8tap,
PREP_8TAP_FN smooth,         SMOOTH,  SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN smooth_regular, SMOOTH,  REGULAR, prep_6tap_16bpc
PREP_8TAP_FN regular_smooth, REGULAR, SMOOTH,  prep_6tap_16bpc
PREP_8TAP_FN regular,        REGULAR, REGULAR
3214
; prep_6tap_16bpc: intermediate-prediction ("prep") path for 16 bpc using
; 6-tap subpel filters, AVX-512 (Ice Lake target, judging by the
; _avx512icl table names). Writes filtered predictions to tmpq.
; The imul replicates the filter-type byte through mxd/myd; adding t0d/t1d
; turns them into packed (table offset | subpel position) selectors.
cglobal prep_6tap_16bpc, 3, 8, 0, tmp, src, ss, w, h, mx, my
%define base r7-prep_avx512icl
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 6tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 6tap_v, my, 4tap_v
    lea                  r7, [prep_avx512icl]
    mov                  wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00 ; any horizontal subpel offset?
    jnz .h
    test                myd, 0xf00 ; any vertical subpel offset?
    jnz .v
; No subpel filtering in either direction: plain copy/scale path,
; dispatched through a per-width jump table.
.prep:
    tzcnt                wd, wd
    mov                 r5d, r7m ; bitdepth_max
    vpbroadcastd         m5, [pw_8192]
    movzx                wd, word [r7+wq*2+table_offset(prep,)]
    shr                 r5d, 11  ; index the per-bitdepth multiplier
    vpbroadcastd         m4, [r7-prep_avx512icl+prep_mul+r5*4]
    add                  wq, r7
    lea                  r6, [ssq*3]
%if WIN64
    ; NOTE(review): restores the register saved by the cglobal prologue
    ; before tail-jumping; relies on x86inc push order — confirm.
    pop                  r7
%endif
    jmp                  wq
; 8-pixel-wide horizontal-only filtering, 4 rows per iteration.
; Reached from .h below, which set up m10 = rounding constant and
; m12/m13/m14 = the three broadcast tap-pairs of the 6-tap filter.
; In the inline comments, letters a-d denote the 4 rows and the digit the
; tap-pair index; prep_endB permutes/packs the dword sums for storage.
.h_w8:
    mova                 m6, [spel_h_shufA]
    movu                 m7, [spel_h_shufC]
    mova                 m8, [prep_endB]
.h_w8_loop:
    movu                ym4, [srcq+ssq*0]
    vinserti32x8         m4, [srcq+ssq*1], 1
    movu                ym5, [srcq+ssq*2]
    vinserti32x8         m5, [srcq+r6   ], 1
    lea                srcq, [srcq+ssq*4]
    mova                 m0, m10
    mova                 m1, m10
    vpermb               m2, m6, m4  ; sliding windows for tap-pair 0
    vpermb               m3, m6, m5
    vpdpwssd             m0, m12, m2 ; a0 b0
    vpdpwssd             m1, m12, m3 ; c0 d0
    vpermb               m4, m7, m4  ; windows for tap-pair 2
    vpermb               m5, m7, m5
    vpdpwssd             m0, m14, m4 ; a2 b2
    vpdpwssd             m1, m14, m5 ; c2 d2
    shufpd               m2, m4, 0x55 ; middle windows from the two halves
    shufpd               m3, m5, 0x55
    vpdpwssd             m0, m13, m2 ; a1 b1
    vpdpwssd             m1, m13, m3 ; c1 d1
    vpermt2b             m0, m8, m1
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8_loop
    RET
; Horizontal-only entry: build the 6-tap filter — the +1 byte offset into
; subpel_filters selects the inner 6 taps of the 8-tap prototype — scale it
; per bitdepth (prep_hv_shift indexed by bitdepth_max>>11), broadcast the
; three tap-pairs to m12-m14, then dispatch by width. The w==4 case is
; shared with the 8-tap implementation.
.h:
    vpbroadcastd        m10, [prep_8tap_rnd]
    test                myd, 0xf00
    jnz .hv
    lea                  r6, [ssq*3]
    cmp                  wd, 4
    je mangle(private_prefix %+ _prep_8tap_16bpc_avx512icl).h_w4
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
    mov                 r5d, r7m
    sub                srcq, 4 ; back up to cover the left filter taps
    shr                 r5d, 11
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    mova             [tmpq], xmm0 ; spill so dword tap-pairs can be broadcast
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    cmp                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m5, [spel_h_shufA]
    vbroadcasti32x4      m6, [spel_h_shufB]
    mova                 m7, [prep_endC]
    jg .h_w32
; 16-pixel-wide: two rows per iteration. The second load at +12 bytes
; overlaps the first so pshufb/shufpd can form every sliding window;
; primed annotations (a0' etc.) denote the upper half of each row.
.h_w16_loop:
    movu                ym2, [srcq+ssq*0+ 0]
    vinserti32x8         m2, [srcq+ssq*1+ 0], 1
    movu                ym3, [srcq+ssq*0+12]
    vinserti32x8         m3, [srcq+ssq*1+12], 1
    lea                srcq, [srcq+ssq*2]
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m5   ; 01
    vpdpwssd             m0, m12, m4  ; a0  b0
    pshufb               m4, m3, m6   ; 89
    vpdpwssd             m1, m14, m4  ; a2' b2'
    pshufb               m2, m6       ; 23
    pshufb               m3, m5       ; 67
    vpdpwssd             m0, m13, m2  ; a1  b1
    vpdpwssd             m1, m13, m3  ; a1' b1'
    shufpd               m2, m3, 0x55 ; 45
    vpdpwssd             m0, m14, m2  ; a2  b2
    vpdpwssd             m1, m12, m2  ; a0' b0'
    vpermt2b             m0, m7, m1
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 2
    jg .h_w16_loop
    RET
; w32/64/128 horizontal-only: same filtering as .h_w16_loop, but iterating
; 32 pixels at a time across a single row (r6 counts words from -w up to 0),
; one row per outer iteration.
.h_w32:
    lea                srcq, [srcq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+12] ; overlapping load, see .h_w16_loop
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m5
    vpdpwssd             m0, m12, m4
    pshufb               m4, m3, m6
    vpdpwssd             m1, m14, m4
    pshufb               m2, m6
    pshufb               m3, m5
    vpdpwssd             m0, m13, m2
    vpdpwssd             m1, m13, m3
    shufpd               m2, m3, 0x55
    vpdpwssd             m0, m14, m2
    vpdpwssd             m1, m12, m2
    vpermt2b             m0, m7, m1
    mova             [tmpq], m0
    add                tmpq, 64
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, ssq
    dec                  hd
    jg .h_w32_loop0
    RET
; Vertical-only entry: pick the vertical filter (falls back to the 4-tap
; variant kept in the low byte when h == 4), scale it per bitdepth,
; broadcast the three tap-pairs to m12-m14, set r6 = -stride for accessing
; the two rows above the current position, and dispatch through the
; _6tap_v jump table.
.v:
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd ; h == 4 -> use the 4-tap selector
    mov                 r5d, r7m
    vpbroadcastd        m10, [prep_8tap_rnd]
    pmovsxbw           xmm0, [base+subpel_filters+1+myq*8]
    tzcnt               r6d, wd
    shr                 r5d, 11
    movzx               r6d, word [r7+r6*2+table_offset(prep, _6tap_v)]
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    add                  r7, r6
    mova             [tmpq], xmm0 ; spill so dword tap-pairs can be broadcast
    vpbroadcastd        m12, xmm0
    mov                  r6, ssq
    vpbroadcastd        m13, [tmpq+ 4]
    neg                  r6
    vpbroadcastd        m14, [tmpq+ 8]
    jmp                  r7
; 4-pixel-wide vertical: 4 output rows per iteration. Five input rows are
; gathered into one zmm via masked qword broadcasts (k1 = 0x330c selects
; the lanes), then punpcklwd interleaves adjacent rows so each vpdpwssd
; applies one tap-pair to four row-pairs at once (digits in the comments
; are input row indices; e.g. "01 12 23 34" = the four interleaved pairs).
.v_w4:
    mov                 r3d, 0x330c
    movq                xm1, [srcq+r6 *2]
    kmovw                k1, r3d
    vpbroadcastq    ym1{k1}, [srcq+r6 *1]
    vpbroadcastq         m2, [srcq+ssq*0]
    vinserti32x4     m1{k1}, m2, [srcq+ssq*1], 3
    movq                xm0, [srcq+ssq*2]
    mova                ym4, [prep_endA]
    valignq              m0, m1, 2 ; align rows 1..4 against rows 0..3
    punpcklwd            m1, m0        ; 01 12 23 34
.v_w4_loop:
    lea                srcq, [srcq+ssq*4]
    movq                xm2, [srcq+r6 *1]
    vpbroadcastq    ym2{k1}, [srcq+ssq*0]
    vpbroadcastq         m3, [srcq+ssq*1]
    vinserti32x4     m2{k1}, m3, [srcq+ssq*2], 3
    mova                 m3, m10
    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
    valignq              m0, m2, m0, 6 ; 4 5 6 7
    punpcklwd            m0, m2        ; 45 56 67 78
    vpdpwssd             m3, m14, m0   ; a2 b2 c2 d2
    vshufi32x4           m1, m0, q1032 ; 23 34 45 56
    vpdpwssd             m3, m13, m1   ; a1 b1 c1 d1
    mova                 m1, m0        ; roll the row-pair window forward
    mova                 m0, m2
    vpermb               m3, m4, m3
    mova             [tmpq], ym3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
; 8-pixel-wide vertical: 4 output rows per iteration. Rows are packed
; three-per-zmm with masked inserts (k1 = 0x33), then spel_v_shuf8
; interleaves adjacent rows into row-pair form ("01 12" etc.); the loop
; maintains a rolling window of interleaved pairs across iterations.
.v_w8:
    vbroadcasti32x4     ym1, [srcq+r6 *1]
    mov                 r3d, 0x33
    vbroadcasti32x4      m2, [srcq+ssq*0]
    kmovb                k1, r3d
    mova                 m6, [spel_v_shuf8]
    vinserti64x2     m1{k1}, m2, [srcq+r6 *2], 0 ; 0 1 2
    vbroadcasti32x4     ym0, [srcq+ssq*1]
    vinserti64x2     m0{k1}, m2, [srcq+ssq*2], 2 ; 2 3 4
    mova                 m7, [prep_endB]
    vpermb               m1, m6, m1  ; 01 12
    vpermb               m2, m6, m0  ; 23 34
.v_w8_loop:
    lea                srcq, [srcq+ssq*4]
    vbroadcasti32x4     ym3, [srcq+r6 *1]
    movu                xm4, [srcq+ssq*0]
    vshufi64x2       m3{k1}, m0, m4, q1032       ; 4 5 6
    vbroadcasti32x4     ym0, [srcq+ssq*1]
    vinserti64x2     m0{k1}, m4, [srcq+ssq*2], 2 ; 6 7 8
    mova                 m4, m10
    vpdpwssd             m4, m12, m1 ; a0 b0
    mova                 m5, m10
    vpdpwssd             m5, m12, m2 ; c0 d0
    vpermb               m1, m6, m3  ; 45 56
    vpdpwssd             m4, m13, m2 ; a1 b1
    vpermb               m2, m6, m0  ; 67 78
    vpdpwssd             m5, m13, m1 ; c1 d1
    vpdpwssd             m4, m14, m1 ; a2 b2
    vpdpwssd             m5, m14, m2 ; c2 d2
    vpermt2b             m4, m7, m5
    mova             [tmpq], m4
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
; 16-pixel-wide vertical: 2 output rows per iteration. spel_v_shuf16
; interleaves two rows per zmm; vpshrdd (concat-shift by 16 bits) derives
; the in-between pair from two adjacent interleaved pairs (e.g. 12+34->23),
; halving the number of loads/permutes needed per iteration.
.v_w16:
    vbroadcasti32x8      m0, [srcq+r6 *1]
    vinserti32x8         m1, m0, [srcq+ssq*0], 1 ; 1 2
    vinserti32x8         m0, [srcq+r6 *2], 0     ; 0 1
    mova                 m6, [spel_v_shuf16]
    movu                ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m3, [srcq+ssq*0], 1     ; 3 4
    mova                 m7, [prep_endA]
    vpermb               m1, m6, m1     ; 12
    vpermb               m0, m6, m0     ; 01
    vpermb               m3, m6, m3     ; 34
    vpshrdd              m2, m1, m3, 16 ; 23
.v_w16_loop:
    mova                 m5, m10
    vpdpwssd             m5, m12, m1    ; b0
    mova                 m4, m10
    vpdpwssd             m4, m12, m0    ; a0
    mova                 m1, m3
    vpdpwssd             m5, m13, m3    ; b1
    movu                ym3, [srcq+ssq*1]
    lea                srcq, [srcq+ssq*2]
    vpdpwssd             m4, m13, m2    ; a1
    vinserti32x8         m3, [srcq+ssq*0], 1
    mova                 m0, m2
    vpermb               m3, m6, m3     ; 56
    vpshrdd              m2, m1, m3, 16 ; 45
    vpdpwssd             m5, m14, m3    ; b2
    vpdpwssd             m4, m14, m2    ; a2
    vpermt2b             m4, m7, m5
    mova             [tmpq], m4
    add                tmpq, 64
    sub                  hd, 2
    jg .v_w16_loop
    RET
; w32/64/128 vertical: one 32-word column tile per outer pass, 2 output
; rows per inner iteration. r5 packs the tile counter into its upper bits
; and h into the low byte (r5 = h + w*8 - 256; h is restored below with
; movzx hd, r5b). Rows are kept as punpckl/punpckhwd interleaved halves;
; the two filtered rows are stored at r8+wq*0 / r8+wq*2 so consecutive
; output rows stay w words apart in tmpq.
.v_w32:
.v_w64:
.v_w128:
%if WIN64
    push                 r8 ; callee-saved on Win64
%endif
    mova                m11, [prep_endC]
    lea                  r5, [hq+wq*8-256]
.v_w32_loop0:
    movu                 m4, [srcq+r6 *2]
    movu                 m5, [srcq+r6 *1]
    lea                  r7, [srcq+ssq*2]
    movu                 m6, [srcq+ssq*0]
    movu                 m7, [srcq+ssq*1]
    mov                  r8, tmpq
    movu                 m8, [r7  +ssq*0]
    punpcklwd            m0, m4, m5  ; 01
    punpckhwd            m4, m5
    punpcklwd            m1, m5, m6  ; 12
    punpckhwd            m5, m6
    punpcklwd            m2, m6, m7  ; 23
    punpckhwd            m6, m7
    punpcklwd            m3, m7, m8  ; 34
    punpckhwd            m7, m8
.v_w32_loop:
    mova                m16, m10
    movu                 m9, [r7+ssq*1]
    mova                m18, m10
    vpdpwssd            m16, m12, m0 ; a0
    mova                m17, m10
    vpdpwssd            m18, m12, m4 ; (high half of row a)
    mova                m19, m10
    vpdpwssd            m17, m12, m1 ; b0
    lea                  r7, [r7+ssq*2]
    vpdpwssd            m19, m12, m5
    mova                 m0, m2
    vpdpwssd            m16, m13, m2 ; a1
    punpcklwd            m2, m8, m9  ; 45
    mova                 m4, m6
    vpdpwssd            m18, m13, m6
    punpckhwd            m6, m8, m9
    movu                 m8, [r7+ssq*0]
    vpdpwssd            m17, m13, m3 ; b1
    mova                 m1, m3
    vpdpwssd            m19, m13, m7
    mova                 m5, m7
    vpdpwssd            m16, m14, m2 ; a2
    punpcklwd            m3, m9, m8  ; 56
    vpdpwssd            m18, m14, m6
    punpckhwd            m7, m9, m8
    vpdpwssd            m17, m14, m3 ; b2
    vpdpwssd            m19, m14, m7
    vpermt2b            m16, m11, m18
    vpermt2b            m17, m11, m19
    mova          [r8+wq*0], m16
    mova          [r8+wq*2], m17
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .v_w32_loop
    add                srcq, 64 ; next 32-pixel column tile
    add                tmpq, 64
    movzx                hd, r5b
    sub                 r5d, 1<<8
    jg .v_w32_loop0
%if WIN64
    pop                  r8
%endif
    vzeroupper
    RET
; 4-pixel-wide horizontal+vertical (hv): horizontal taps come from the
; plain 8-tap table entry (no +1 offset), but only [tmpq+4]/[tmpq+8] are
; broadcast to m8/m9 — i.e. the 4-tap filter as stored in the 8-tap layout.
; Vertical taps (pre-scaled <<2) go to m12-m14, with intermediate rounding
; in m11 (pd_128, set at .hv). Setup filters rows 0-4 horizontally; the
; loop filters 4 new rows and runs the vertical pass on the interleaved
; row-pairs ("a0 b0 c0 d0" = vertical tap-pair 0 applied to 4 outputs).
.hv_w4:
    movzx               mxd, mxb
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd ; h == 4 -> 4-tap vertical
    mov                 r5d, r7m
    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
    mov                  r6, ssq
    sub                srcq, 2
    shr                 r5d, 11
    neg                  r6
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    psllw              xmm1, 2 ; vertical taps pre-scaled by 4
    mova          [tmpq+ 0], xmm0
    mova          [tmpq+16], xmm1
    vpbroadcastd         m8, [tmpq+ 4]
    mov                 r3d, 0xf0
    vpbroadcastd         m9, [tmpq+ 8]
    vpbroadcastd        m12, xmm1
    movu                xm3, [srcq+r6 *2]
    kmovb                k1, r3d
    vinserti32x4        ym3, [srcq+r6 *1], 1
    vbroadcasti32x4      m2, [srcq+ssq*0]
    vinserti64x2     m3{k1}, m2, [srcq+ssq*1], 3
    movu                xm4, [srcq+ssq*2]
    vbroadcasti32x4      m5, [spel_h_shufA]
    vbroadcasti32x4      m6, [spel_h_shufB]
    mova                 m1, m11
    mova                m15, [spel_shuf4a]
    mova                xm2, xm11
    pshufb               m0, m3, m5
    vpdpwssd             m1, m8, m0
    pshufb              xm0, xm4, xm5
    vpdpwssd            xm2, xm8, xm0
    vpbroadcastd        m13, [tmpq+20]
    pshufb               m3, m6
    vpbroadcastd        m14, [tmpq+24]
    pshufb              xm4, xm6
    mova                 m7, [spel_shuf4b]
    vpdpwssd             m1, m9, m3    ; 0 1 2 3
    vpdpwssd            xm2, xm9, xm4  ; 4
    vpermt2b             m1, m15, m2   ; 01 12 23 34
    mova               ym15, [prep_endA]
.hv_w4_loop:
    lea                srcq, [srcq+ssq*4]
    movu                xm4, [srcq+r6 *1]
    vinserti32x4        ym4, [srcq+ssq*0], 1
    vbroadcasti32x4      m3, [srcq+ssq*1]
    vinserti64x2     m4{k1}, m3, [srcq+ssq*2], 3
    mova                 m2, m11
    pshufb               m3, m4, m5
    vpdpwssd             m2, m8, m3
    mova                 m3, m10
    vpdpwssd             m3, m12, m1   ; a0 b0 c0 d0
    pshufb               m4, m6
    vpdpwssd             m2, m9, m4    ; 5 6 7 8
    mova                 m4, m1
    vpermt2b             m1, m7, m2    ; 45 56 67 78
    vpdpwssd             m3, m14, m1   ; a2 b2 c2 d2
    vshufi32x4           m4, m1, q1032 ; 23 34 45 56
    vpdpwssd             m3, m13, m4   ; a1 b1 c1 d1
    vpermb               m3, m15, m3
    mova             [tmpq], ym3
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    RET
; 8-pixel-wide hv: setup horizontally filters rows 0-4 (lower-case letters
; a-e in the comments, digit = horizontal tap-pair); the loop filters four
; new rows (f-i) and applies the vertical taps m15-m17 to the interleaved
; row-pairs (upper-case A-D = the four output rows, digit = vertical
; tap-pair). m10/m11 hold the horizontal/vertical rounding constants.
.hv_w8:
    mova                 m8, [spel_h_shufA]
    movu               ym18, [srcq+r6 *2]
    vinserti32x8        m18, [srcq+r6 *1], 1 ; 0 1
    movu               ym19, [srcq+ssq*0]
    vinserti32x8        m19, [srcq+ssq*1], 1 ; 2 3
    movu               ym20, [srcq+ssq*2]    ; 4
    movu                 m9, [spel_h_shufC]
    mova                m21, [spel_shuf8a]
    mova                 m0, [spel_shuf8b]
    vpermb               m4, m8, m18
    mova                 m1, m10
    vpermb               m5, m8, m19
    vpdpwssd             m1, m12, m4  ; a0 b0
    mova                 m2, m10
    vpermb               m6, m8, m20
    vpdpwssd             m2, m12, m5  ; c0 d0
    mova                 m3, m10
    vpermb              m18, m9, m18
    vpdpwssd             m3, m12, m6  ; e0
    mova                 m7, [prep_endB]
    vpermb              m19, m9, m19
    vpdpwssd             m1, m14, m18 ; a2 b2
    vpermb              m20, m9, m20
    vpdpwssd             m2, m14, m19 ; c2 d2
    shufpd               m4, m18, 0x55
    vpdpwssd             m3, m14, m20 ; e2
    shufpd               m5, m19, 0x55
    vpdpwssd             m1, m13, m4  ; a1 b1
    shufpd               m6, m20, 0x55
    vpdpwssd             m2, m13, m5  ; c1 d1
    vpdpwssd             m3, m13, m6  ; e1
    vpermt2b             m1, m21, m2  ; 01 12
    vpermt2b             m2, m21, m3  ; 23 34
.hv_w8_loop:
    lea                srcq, [srcq+ssq*4]
    movu               ym18, [srcq+r6 *1]
    vinserti32x8        m18, [srcq+ssq*0], 1
    movu               ym19, [srcq+ssq*1]
    vinserti32x8        m19, [srcq+ssq*2], 1
    mova                 m3, m10
    vpermb               m5, m8, m18
    mova                 m4, m10
    vpermb               m6, m8, m19
    vpdpwssd             m3, m12, m5  ; f0 g0
    mova                m20, m11
    vpdpwssd             m4, m12, m6  ; h0 i0
    mova                m21, m11
    vpdpwssd            m20, m15, m1  ; A0 B0
    vpermb              m18, m9, m18
    vpdpwssd            m21, m15, m2  ; C0 D0
    vpermb              m19, m9, m19
    vpdpwssd             m3, m14, m18 ; f2 g2
    vpdpwssd             m4, m14, m19 ; h2 i2
    shufpd               m5, m18, 0x55
    vpdpwssd            m20, m16, m2  ; A1 B1
    shufpd               m6, m19, 0x55
    vpdpwssd             m3, m13, m5  ; f1 g1
    vpdpwssd             m4, m13, m6  ; h1 i1
    vpermt2b             m2, m0, m3   ; 45 56
    vpdpwssd            m21, m16, m2  ; C1 D1
    mova                 m1, m2
    vpermt2b             m2, m0, m4   ; 67 78
    vpdpwssd            m20, m17, m1  ; A2 B2
    vpdpwssd            m21, m17, m2  ; C2 D2
    vpermt2b            m20, m7, m21
    mova             [tmpq], m20
    add                tmpq, 64
    sub                  hd, 4
    jg .hv_w8_loop
    vzeroupper
    RET
; Combined horizontal+vertical entry: build both 6-tap filters (+1 byte
; offset into subpel_filters = inner 6 taps of the 8-tap prototype).
; Horizontal taps are scaled per bitdepth; vertical taps are pre-scaled
; <<2, with pd_128 (m11) as the second-stage rounding constant. Tap-pairs
; are broadcast as m12-m14 (horizontal) and m15-m17 (vertical), then
; control dispatches by width. Vertical falls back to the 4-tap variant
; when h < 6.
.hv:
    vpbroadcastd        m11, [pd_128]
    cmp                  wd, 4
    je .hv_w4
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+1+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd ; h < 6 -> 4-tap vertical
    mov                 r5d, r7m
    pmovsxbw           xmm1, [base+subpel_filters+1+myq*8]
    mov                  r6, ssq
    sub                srcq, 4
    shr                 r5d, 11
    neg                  r6
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    psllw              xmm1, 2
    mova          [tmpq+ 0], xmm0 ; spill so dword tap-pairs can be broadcast
    mova          [tmpq+16], xmm1
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, xmm1
    vpbroadcastd        m16, [tmpq+20]
    vpbroadcastd        m17, [tmpq+24]
    cmp                  wd, 16
    jl .hv_w8
    vbroadcasti32x4      m8, [spel_h_shufA]
    vbroadcasti32x4      m9, [spel_h_shufB]
    jg .hv_w32
    ; 16-pixel-wide hv (fall-through from .hv): prime rows 0-4.
    ; NOTE(review): row 0 is assembled from differently-offset loads
    ; (+8/+16/+0) and merged into the pair stream with pslldq+vpshrdd —
    ; its layout deliberately differs from rows 1-4; confirm against the
    ; spel_shuf16 table before touching. Primed annotations (b0' etc.)
    ; denote the upper half of each row.
    vbroadcasti32x8      m6, [srcq+r6 *2+ 8]
    vinserti32x8         m2, m6, [srcq+r6 *2+16], 1
    vinserti32x8         m6, [srcq+r6 *2+ 0], 0 ; 0
    movu               ym18, [srcq+r6 *1+ 0]
    movu               ym19, [srcq+r6 *1+12]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1 ; 1 2
    movu               ym20, [srcq+ssq*1+ 0]
    movu               ym21, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m20, [srcq+ssq*0+ 0], 1
    vinserti32x8        m21, [srcq+ssq*0+12], 1 ; 3 4
    pshufb               m2, m8
    mova                 m1, m10
    pshufb               m3, m18, m8
    vpdpwssd             m1, m14, m2    ; a2
    mova                 m2, m10
    pshufb               m4, m19, m9
    vpdpwssd             m2, m12, m3    ; b0  c0
    mova                 m3, m10
    pshufb               m5, m20, m8
    vpdpwssd             m3, m14, m4    ; b2' c2'
    mova                 m4, m10
    pshufb               m7, m21, m9
    vpdpwssd             m4, m12, m5    ; d0  e0
    mova                 m5, m10
    pshufb               m0, m6, m8
    vpdpwssd             m5, m14, m7    ; d2' e2'
    mova                 m7, [spel_shuf16]
    pshufb              m18, m9
    vpdpwssd             m1, m12, m0    ; a0
    pshufb              m19, m8
    vpdpwssd             m2, m13, m18   ; b1  c1
    pshufb              m20, m9
    vpdpwssd             m3, m13, m19   ; b1' c1'
    pshufb              m21, m8
    vpdpwssd             m4, m13, m20   ; d1  e1
    pshufb               m6, m9
    vpdpwssd             m5, m13, m21   ; d1' e1'
    mova                 m0, [prep_endB]
    shufpd              m18, m19, 0x55
    vpdpwssd             m1, m13, m6    ; a1
    shufpd              m20, m21, 0x55
    vpdpwssd             m2, m14, m18   ; b2  c2
    vpdpwssd             m3, m12, m18   ; b0' c0'
    vpdpwssd             m4, m14, m20   ; d2  e2
    vpdpwssd             m5, m12, m20   ; d0' e0'
    pslldq               m1, 1
    vpermt2b             m2, m7, m3     ; 12
    vpermt2b             m4, m7, m5     ; 34
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
; Steady state: two new rows (f g) per iteration; A/B are the two vertical
; outputs, accumulated with the vertical tap-pairs m15-m17 into the pd_128
; rounding base (m11).
.hv_w16_loop:
    movu               ym18, [srcq+ssq*1+ 0]
    movu               ym19, [srcq+ssq*1+12]
    lea                srcq, [srcq+ssq*2]
    vinserti32x8        m18, [srcq+ssq*0+ 0], 1
    vinserti32x8        m19, [srcq+ssq*0+12], 1
    mova                 m5, m10
    mova                 m6, m10
    pshufb              m21, m18, m8
    vpdpwssd             m5, m12, m21   ; f0  g0
    pshufb              m20, m19, m9
    mova                m21, m11
    vpdpwssd             m6, m14, m20   ; f2' g2'
    mova                m20, m11
    vpdpwssd            m21, m15, m2    ; B0
    mova                 m2, m4
    vpdpwssd            m20, m15, m1    ; A0
    mova                 m1, m3
    pshufb              m18, m9
    vpdpwssd             m5, m13, m18   ; f1  g1
    pshufb              m19, m8
    vpdpwssd             m6, m13, m19   ; f1' g1'
    vpdpwssd            m21, m16, m4    ; B1
    vpdpwssd            m20, m16, m3    ; A1
    shufpd              m18, m19, 0x55
    vpdpwssd             m5, m14, m18   ; f2  g2
    vpdpwssd             m6, m12, m18   ; f0' g0'
    mova                 m4, m7
    vpermi2b             m4, m5, m6     ; 56
    vpshrdd              m3, m2, m4, 16 ; 45
    vpdpwssd            m21, m17, m4    ; B2
    vpdpwssd            m20, m17, m3    ; A2
    vpermt2b            m20, m0, m21
    mova             [tmpq], m20
    add                tmpq, 64
    sub                  hd, 2
    jg .hv_w16_loop
    vzeroupper
    RET
; w32+ hv: per-32-column tile processing. r5 packs the tile counter in its
; upper bits and h in the low byte (same scheme as .v_w32). Unprimed /
; primed annotations distinguish the low / high interleaved halves of each
; row. This prologue horizontally filters rows 0-4 (a-e) and builds the
; initial interleaved row-pairs; rows 0 (a) get shifted into pair form via
; pslldq+vpshrdd as in .hv_w16.
.hv_w32:
    WIN64_SPILL_XMM      29
%if WIN64
    push                 r8
%endif
    mova                m27, [spel_shuf32]
    lea                 r5d, [hq+wq*8-256]
    mova                m28, [prep_endC]
.hv_w32_loop0:
    movu                m18, [srcq+r6 *2+ 0]
    movu                 m7, [srcq+r6 *2+12]
    movu                 m6, [srcq+r6 *1+ 0]
    movu                m20, [srcq+r6 *1+12]
    lea                  r7, [srcq+ssq*2]
    movu                m19, [srcq+ssq*0+ 0]
    movu                m21, [srcq+ssq*0+12]
    movu                m22, [srcq+ssq*1+ 0]
    movu                m24, [srcq+ssq*1+12]
    mov                  r8, tmpq
    movu                m23, [r7  +ssq*0+ 0]
    movu                m25, [r7  +ssq*0+12]
    pshufb               m1, m18, m8
    mova                 m0, m10
    pshufb               m2, m7, m9
    vpdpwssd             m0, m12, m1    ; a0
    mova                 m1, m10
    pshufb               m4, m6, m8
    vpdpwssd             m1, m14, m2    ; a2'
    mova                 m2, m10
    pshufb               m3, m19, m8
    vpdpwssd             m2, m12, m4    ; b0
    mova                 m4, m10
    pshufb               m5, m20, m9
    vpdpwssd             m4, m12, m3    ; c0
    mova                 m3, m10
    pshufb              m26, m21, m9
    vpdpwssd             m3, m14, m5    ; b2'
    mova                 m5, m10
    pshufb              m18, m9
    vpdpwssd             m5, m14, m26   ; c2'
    pshufb               m7, m8
    vpdpwssd             m0, m13, m18   ; a1
    pshufb               m6, m9
    vpdpwssd             m1, m13, m7    ; a1'
    pshufb              m19, m9
    vpdpwssd             m2, m13, m6    ; b1
    pshufb              m20, m8
    vpdpwssd             m4, m13, m19   ; c1
    pshufb              m21, m8
    vpdpwssd             m3, m13, m20   ; b1'
    shufpd              m18, m7, 0x55
    vpdpwssd             m5, m13, m21   ; c1'
    shufpd               m6, m20, 0x55
    vpdpwssd             m0, m14, m18   ; a2
    shufpd              m19, m21, 0x55
    vpdpwssd             m1, m12, m18   ; a0'
    pshufb              m18, m22, m8
    vpdpwssd             m2, m14, m6    ; b2
    pshufb               m7, m23, m8
    vpdpwssd             m4, m14, m19   ; c2
    vpdpwssd             m3, m12, m6    ; b0'
    mova                 m6, m10
    vpdpwssd             m5, m12, m19   ; c0'
    pshufb              m19, m24, m9
    vpdpwssd             m6, m12, m18   ; d0
    mova                m18, m10
    pshufb              m26, m25, m9
    vpdpwssd            m18, m12, m7    ; e0
    mova                 m7, m10
    pshufb              m22, m9
    vpdpwssd             m7, m14, m19   ; d2'
    mova                m19, m10
    pshufb              m23, m9
    vpdpwssd            m19, m14, m26   ; e2'
    pshufb              m24, m8
    vpdpwssd             m6, m13, m22   ; d1
    pshufb              m25, m8
    vpdpwssd            m18, m13, m23   ; e1
    shufpd              m22, m24, 0x55
    vpdpwssd             m7, m13, m24   ; d1'
    shufpd              m23, m25, 0x55
    vpdpwssd            m19, m13, m25   ; e1'
    pslldq               m0, 1
    vpdpwssd             m6, m14, m22   ; d2
    pslldq               m1, 1
    vpdpwssd            m18, m14, m23   ; e2
    vpermt2b             m2, m27, m4    ; 12
    vpdpwssd             m7, m12, m22   ; d0'
    vpermt2b             m3, m27, m5    ; 12'
    vpdpwssd            m19, m12, m23   ; e0'
    vpshrdd              m0, m2, 16     ; 01
    vpermt2b             m6, m27, m18   ; 34
    vpshrdd              m1, m3, 16     ; 01'
    vpermt2b             m7, m27, m19   ; 34'
    vpshrdd              m4, m2, m6, 16 ; 23
    vpshrdd              m5, m3, m7, 16 ; 23'
; Inner loop: two new input rows (f g) per iteration, producing two output
; rows (A B), each split into unprimed/primed halves. Outputs are stored
; w words apart (r8+wq*0 / r8+wq*2) so consecutive rows of the tile land
; at the right positions in tmpq.
.hv_w32_loop:
    movu                m22, [r7+ssq*1+ 0]
    movu                m24, [r7+ssq*1+12]
    lea                  r7, [r7+ssq*2]
    movu                m23, [r7+ssq*0+ 0]
    movu                m25, [r7+ssq*0+12]
    mova                m19, m11
    vpdpwssd            m19, m15, m2    ; B0
    mova                m21, m11
    vpdpwssd            m21, m15, m3    ; B0'
    mova                m18, m11
    vpdpwssd            m18, m15, m0    ; A0
    mova                m20, m11
    vpdpwssd            m20, m15, m1    ; A0'
    mova                 m2, m6
    vpdpwssd            m19, m16, m6    ; B1
    mova                 m3, m7
    vpdpwssd            m21, m16, m7    ; B1'
    mova                 m0, m4
    vpdpwssd            m18, m16, m4    ; A1
    mova                 m1, m5
    pshufb               m4, m22, m8
    vpdpwssd            m20, m16, m5    ; A1'
    mova                 m6, m10
    pshufb               m7, m23, m8
    vpdpwssd             m6, m12, m4    ; f0
    mova                 m4, m10
    pshufb               m5, m24, m9
    vpdpwssd             m4, m12, m7    ; g0
    mova                 m7, m10
    pshufb              m26, m25, m9
    vpdpwssd             m7, m14, m5    ; f2'
    mova                 m5, m10
    pshufb              m22, m9
    vpdpwssd             m5, m14, m26   ; g2'
    pshufb              m23, m9
    vpdpwssd             m6, m13, m22   ; f1
    pshufb              m24, m8
    vpdpwssd             m4, m13, m23   ; g1
    pshufb              m25, m8
    vpdpwssd             m7, m13, m24   ; f1'
    shufpd              m22, m24, 0x55
    vpdpwssd             m5, m13, m25   ; g1'
    shufpd              m23, m25, 0x55
    vpdpwssd             m6, m14, m22   ; f2
    vpdpwssd             m4, m14, m23   ; g2
    vpdpwssd             m7, m12, m22   ; f0'
    vpdpwssd             m5, m12, m23   ; g0'
    vpermt2b             m6, m27, m4    ; 56
    vpermt2b             m7, m27, m5    ; 56'
    vpdpwssd            m19, m17, m6    ; B2
    vpshrdd              m4, m2, m6, 16 ; 45
    vpdpwssd            m21, m17, m7    ; B2'
    vpshrdd              m5, m3, m7, 16 ; 45'
    vpdpwssd            m18, m17, m4    ; A2
    vpdpwssd            m20, m17, m5    ; A2'
    vpermt2b            m19, m28, m21
    vpermt2b            m18, m28, m20
    mova          [r8+wq*0], m18
    mova          [r8+wq*2], m19
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .hv_w32_loop
    add                srcq, 64 ; next 32-pixel column tile
    add                tmpq, 64
    movzx                hd, r5b ; restore h from the low byte of r5
    sub                 r5d, 1<<8
    jg .hv_w32_loop0
%if WIN64
    pop                  r8
%endif
    RET
3971
; Entry points for the remaining (horizontal, vertical) filter-type
; combinations. Each PREP_8TAP_FN stub records the filter pair and jumps
; into the shared prep_8tap_16bpc implementation named as the 4th argument.
; The final (sharp, sharp) invocation deliberately omits that argument so
; its stub falls straight through into the cglobal defined directly below.
PREP_8TAP_FN smooth_sharp,   SMOOTH,  SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_smooth,   SHARP,   SMOOTH,  prep_8tap_16bpc
PREP_8TAP_FN regular_sharp,  REGULAR, SHARP,   prep_8tap_16bpc
PREP_8TAP_FN sharp_regular,  SHARP,   REGULAR, prep_8tap_16bpc
PREP_8TAP_FN sharp,          SHARP,   SHARP
3977
;-----------------------------------------------------------------------
; void prep_8tap_16bpc(int16_t *tmp, const pixel *src, ptrdiff_t stride,
;                      int w, int h, int mx, int my, int bitdepth_max)
; 8-tap subpel "prep" (intermediate prediction) for 16 bpc, AVX-512/ICL.
; Dispatches on the fractional parts of mx/my:
;   both integer      -> shared 6tap .prep copy loop
;   mx fractional     -> .h  (horizontal-only)
;   my fractional     -> .v  (vertical-only)
;   both fractional   -> .hv (separable 2-D)
; Accumulation is dword via vpdpwssd; results are permuted/packed back to
; words through the prep_end* shuffle tables before storing to tmp.
;-----------------------------------------------------------------------
cglobal prep_8tap_16bpc, 3, 8, 0, tmp, src, stride, w, h, mx, my
%define base r7-prep_avx512icl
    imul                mxd, mxm, 0x010101
    add                 mxd, t0d ; 8tap_h, mx, 4tap_h
    imul                myd, mym, 0x010101
    add                 myd, t1d ; 8tap_v, my, 4tap_v
    lea                  r7, [prep_avx512icl]
    mov                  wd, wm
    movifnidn            hd, hm
    test                mxd, 0xf00
    jnz .h
    test                myd, 0xf00
    jz mangle(private_prefix %+ _prep_6tap_16bpc_avx512icl).prep
.v: ; vertical-only path (mx integer, my fractional)
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd      ; h == 4: use the 4-tap variant packed in the low byte
    mov                 r5d, r7m      ; bitdepth_max
    vpbroadcastd        m10, [prep_8tap_rnd]
    pmovsxbw           xmm0, [base+subpel_filters+myq*8]
    tzcnt               r6d, wd
    shr                 r5d, 11       ; 0 for 10-bit, 1 for 12-bit content
    movzx               r6d, word [r7+r6*2+table_offset(prep, _8tap_v)]
    psllw              xmm0, [base+prep_hv_shift+r5*8] ; scale taps for bitdepth
    add                  r7, r6
    lea                  r6, [strideq*3]
    sub                srcq, r6       ; back up 3 rows for the 8-tap window
    mova             [tmpq], xmm0     ; spill taps so pairs can be rebroadcast below
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    jmp                  r7           ; dispatch on log2(w)
; w == 4 vertical: 4 rows gathered per zmm (qword lanes), 4 output rows/iter
.v_w4:
    mov                 r3d, 0x330c
    movq                xm1, [srcq+strideq*0]
    kmovw                k1, r3d
    vpbroadcastq    ym1{k1}, [srcq+strideq*1]
    vpbroadcastq         m0, [srcq+r6       ]
    vinserti32x4     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2 3
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq    ym0{k1}, [srcq+strideq*0]
    vpbroadcastq         m2, [srcq+strideq*1]
    vinserti32x4     m0{k1}, m2, [srcq+strideq*2], 3 ; 3 4 5 6
    mova                ym5, [prep_endA]
    vshufi32x4           m3, m1, m0, q1021 ; 1 2 3 4
    vshufi32x4           m2, m1, m0, q2132 ; 2 3 4 5
    punpcklwd            m1, m3            ; 01 12 23 34
    punpcklwd            m2, m0            ; 23 34 45 56
.v_w4_loop:
    movq                xm4, [srcq+r6       ]
    lea                srcq, [srcq+strideq*4]
    vpbroadcastq    ym4{k1}, [srcq+strideq*0]
    vpbroadcastq         m3, [srcq+strideq*1]
    vinserti32x4     m4{k1}, m3, [srcq+strideq*2], 3 ; 7 8 9 a
    mova                 m3, m10
    vpdpwssd             m3, m12, m1       ; a0 b0 c0 d0
    valignq              m1, m4, m0, 6     ; 6 7 8 9
    vpdpwssd             m3, m13, m2       ; a1 b1 c1 d1
    mova                 m0, m4
    punpcklwd            m4, m1, m4        ; 67 78 89 9a
    vpdpwssd             m3, m15, m4       ; a3 b3 c3 d3
    vshufi32x4           m1, m2, m4, q1032 ; 45 56 67 78
    vpdpwssd             m3, m14, m1       ; a2 b2 c2 d2
    mova                 m2, m4
    vpermb               m3, m5, m3        ; pack dwords back to words
    mova             [tmpq], ym3
    add                tmpq, 32
    sub                  hd, 4
    jg .v_w4_loop
    RET
; w == 8 vertical: rows paired via spel_v_shuf8, 4 output rows/iter
.v_w8:
    movu                xm0, [srcq+strideq*0]
    mov                 r3d, 0x33
    vbroadcasti32x4     ym1, [srcq+strideq*1]
    kmovb                k1, r3d
    mova                 m7, [spel_v_shuf8]
    vinserti64x2     m1{k1}, m0, [srcq+strideq*2], 2 ; 0 1 2
    add                srcq, r6
    vbroadcasti32x4     ym2, [srcq+strideq*0]
    vbroadcasti32x4      m3, [srcq+strideq*1]
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vshufi64x2       m2{k1}, m1, m3, q1032    ; 2 3 4
    vinserti64x2     m0{k1}, m3, [srcq+r6], 2 ; 4 5 6
    mova                 m8, [prep_endB]
    vpermb               m1, m7, m1  ; 01 12
    vpermb               m2, m7, m2  ; 23 34
    vpermb               m3, m7, m0  ; 45 56
.v_w8_loop:
    lea                srcq, [srcq+strideq*4]
    vbroadcasti32x4     ym4, [srcq+strideq*0]
    movu                xm5, [srcq+strideq*1]
    vshufi64x2       m4{k1}, m0, m5, q1032    ; 6 7 8
    vbroadcasti32x4     ym0, [srcq+strideq*2]
    vinserti64x2     m0{k1}, m5, [srcq+r6], 2 ; 8 9 a
    mova                 m5, m10
    vpdpwssd             m5, m12, m1 ; a0 b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m2 ; c0 d0
    mova                 m1, m3
    vpdpwssd             m5, m13, m2 ; a1 b1
    vpdpwssd             m6, m13, m3 ; c1 d1
    vpermb               m2, m7, m4  ; 67 78
    vpdpwssd             m5, m14, m3 ; a2 b2
    vpermb               m3, m7, m0  ; 89 9a
    vpdpwssd             m6, m14, m2 ; c2 d2
    vpdpwssd             m5, m15, m2 ; a3 b3
    vpdpwssd             m6, m15, m3 ; c3 d3
    vpermt2b             m5, m8, m6  ; pack both accumulators to words
    mova             [tmpq], m5
    add                tmpq, 64
    sub                  hd, 4
    jg .v_w8_loop
    RET
; w == 16 vertical: two rows per zmm, 2 output rows/iter
.v_w16:
    vbroadcasti32x8      m0, [srcq+strideq*1]
    vinserti32x8         m1, m0, [srcq+strideq*2], 1
    vinserti32x8         m0, [srcq+strideq*0], 0
    mova                 m8, [spel_v_shuf16]
    add                srcq, r6
    movu                ym3, [srcq+strideq*0]
    vinserti32x8         m3, [srcq+strideq*1], 1
    movu                ym5, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x8         m5, [srcq+strideq*0], 1
    mova                m11, [prep_endA]
    vpermb               m1, m8, m1     ; 12
    vpermb               m0, m8, m0     ; 01
    vpermb               m3, m8, m3     ; 34
    vpermb               m5, m8, m5     ; 56
    vpshrdd              m2, m1, m3, 16 ; 23
    vpshrdd              m4, m3, m5, 16 ; 45
.v_w16_loop:
    mova                 m7, m10
    vpdpwssd             m7, m12, m1    ; b0
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; a0
    mova                 m1, m3
    vpdpwssd             m7, m13, m3    ; b1
    mova                 m0, m2
    vpdpwssd             m6, m13, m2    ; a1
    mova                 m3, m5
    vpdpwssd             m7, m14, m5    ; b2
    mova                 m2, m4
    vpdpwssd             m6, m14, m4    ; a2
    movu                ym5, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m5, [srcq+strideq*0], 1
    vpermb               m5, m8, m5     ; 78
    vpshrdd              m4, m3, m5, 16 ; 67
    vpdpwssd             m7, m15, m5    ; b3
    vpdpwssd             m6, m15, m4    ; a3
    vpermt2b             m6, m11, m7
    mova             [tmpq], m6
    add                tmpq, 64
    sub                  hd, 2
    jg .v_w16_loop
    RET
; w >= 32 vertical: processed in 32-pixel column strips, 2 rows/iter;
; r5 packs the loop counters (h in the low byte, strip count above it)
.v_w32:
.v_w64:
.v_w128:
    WIN64_PUSH_XMM       23
%if WIN64
    push                 r8
%endif
    mova                m11, [prep_endC]
    lea                  r5, [hq+wq*8-256]
.v_w32_loop0:
    movu                m16, [srcq+strideq*0]
    movu                m17, [srcq+strideq*1]
    lea                  r7, [srcq+r6]
    movu                m18, [srcq+strideq*2]
    movu                m19, [r7  +strideq*0]
    mov                  r8, tmpq
    movu                m20, [r7  +strideq*1]
    movu                m21, [r7  +strideq*2]
    add                  r7, r6
    movu                m22, [r7  +strideq*0]
    punpcklwd            m0, m16, m17 ; 01l
    punpckhwd           m16, m17      ; 01h
    punpcklwd            m1, m17, m18 ; 12l
    punpckhwd           m17, m18      ; 12h
    punpcklwd            m2, m18, m19 ; 23l
    punpckhwd           m18, m19      ; 23h
    punpcklwd            m3, m19, m20 ; 34l
    punpckhwd           m19, m20      ; 34h
    punpcklwd            m4, m20, m21 ; 45l
    punpckhwd           m20, m21      ; 45h
    punpcklwd            m5, m21, m22 ; 56l
    punpckhwd           m21, m22      ; 56h
.v_w32_loop:
    mova                 m6, m10
    vpdpwssd             m6, m12, m0  ; a0l
    mova                 m8, m10
    vpdpwssd             m8, m12, m16 ; a0h
    mova                 m7, m10
    vpdpwssd             m7, m12, m1  ; b0l
    mova                 m9, m10
    vpdpwssd             m9, m12, m17 ; b0h
    mova                 m0, m2
    vpdpwssd             m6, m13, m2  ; a1l
    mova                m16, m18
    vpdpwssd             m8, m13, m18 ; a1h
    mova                 m1, m3
    vpdpwssd             m7, m13, m3  ; b1l
    mova                m17, m19
    vpdpwssd             m9, m13, m19 ; b1h
    mova                 m2, m4
    vpdpwssd             m6, m14, m4  ; a2l
    mova                m18, m20
    vpdpwssd             m8, m14, m20 ; a2h
    mova                 m3, m5
    vpdpwssd             m7, m14, m5  ; b2l
    mova                m19, m21
    vpdpwssd             m9, m14, m21 ; b2h
    movu                m21, [r7+strideq*1]
    lea                  r7, [r7+strideq*2]
    punpcklwd            m4, m22, m21 ; 67l
    punpckhwd           m20, m22, m21 ; 67h
    movu                m22, [r7+strideq*0]
    vpdpwssd             m6, m15, m4  ; a3l
    vpdpwssd             m8, m15, m20 ; a3h
    punpcklwd            m5, m21, m22 ; 78l
    punpckhwd           m21, m22      ; 78h
    vpdpwssd             m7, m15, m5  ; b3l
    vpdpwssd             m9, m15, m21 ; b3h
    vpermt2b             m6, m11, m8
    vpermt2b             m7, m11, m9
    mova          [r8+wq*0], m6
    mova          [r8+wq*2], m7
    lea                  r8, [r8+wq*4]
    sub                  hd, 2
    jg .v_w32_loop
    add                srcq, 64     ; next 32-pixel column strip
    add                tmpq, 64
    movzx                hd, r5b    ; reload row count from packed counter
    sub                 r5d, 1<<8
    jg .v_w32_loop0
%if WIN64
    pop                  r8
%endif
    RET
; w == 4 horizontal: 4-tap filter, 4 rows/iter
.h_w4:
    RESET_STACK_STATE
    movzx               mxd, mxb
    sub                srcq, 2      ; center the 4-tap window
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mov                 r5d, r7m   ; bitdepth_max
    vbroadcasti32x4      m4, [spel_h_shufA]
    vbroadcasti32x4      m5, [spel_h_shufB]
    shr                 r5d, 11    ; 0 for 10-bit, 1 for 12-bit
    mova                ym9, [prep_endA]
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    mova             [tmpq], xmm0  ; spill taps for pairwise rebroadcast
    vpbroadcastd         m6, [tmpq+4]
    vpbroadcastd         m7, [tmpq+8]
.h_w4_loop:
    movu                xm2, [srcq+strideq*0]
    vinserti32x4        ym2, [srcq+strideq*1], 1
    vinserti32x4         m2, [srcq+strideq*2], 2
    vinserti32x4         m2, [srcq+r6       ], 3
    lea                srcq, [srcq+strideq*4]
    mova                 m0, m10
    pshufb               m1, m2, m4
    vpdpwssd             m0, m6, m1
    pshufb               m2, m5
    vpdpwssd             m0, m7, m2
    vpermb               m0, m9, m0
    mova             [tmpq], ym0
    add                tmpq, 32
    sub                  hd, 4
    jg .h_w4_loop
    RET
; w == 8 horizontal: full 8-tap via four vpermb shuffles, 4 rows/iter
.h_w8:
    mova                 m6, [spel_h_shufA]
    movu                 m7, [spel_h_shufB]
    movu                 m8, [spel_h_shufC]
    mova                 m9, [spel_h_shufD]
    mova                m11, [prep_endB]
.h_w8_loop:
    movu                ym4, [srcq+strideq*0]
    vinserti32x8         m4, [srcq+strideq*1], 1
    movu                ym5, [srcq+strideq*2]
    vinserti32x8         m5, [srcq+r6       ], 1
    lea                srcq, [srcq+strideq*4]
    mova                 m0, m10
    mova                 m1, m10
    vpermb               m2, m6, m4
    vpermb               m3, m6, m5
    vpdpwssd             m0, m12, m2
    vpdpwssd             m1, m12, m3
    vpermb               m2, m7, m4
    vpermb               m3, m7, m5
    vpdpwssd             m0, m13, m2
    vpdpwssd             m1, m13, m3
    vpermb               m2, m8, m4
    vpermb               m3, m8, m5
    vpdpwssd             m0, m14, m2
    vpdpwssd             m1, m14, m3
    vpermb               m2, m9, m4
    vpermb               m3, m9, m5
    vpdpwssd             m0, m15, m2
    vpdpwssd             m1, m15, m3
    vpermt2b             m0, m11, m1
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 4
    jg .h_w8_loop
    RET
; horizontal entry: falls through to .hv if my is also fractional
.h:
    vpbroadcastd        m10, [prep_8tap_rnd]
    test                myd, 0xf00
    jnz .hv
    lea                  r6, [strideq*3]
    cmp                  wd, 4
    je .h_w4
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    mov                 r5d, r7m   ; bitdepth_max
    sub                srcq, 6     ; center the 8-tap window
    shr                 r5d, 11
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    mova             [tmpq], xmm0
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    cmp                  wd, 16
    jl .h_w8
    vbroadcasti32x4      m6, [spel_h_shufA]
    vbroadcasti32x4      m7, [spel_h_shufB]
    mova                m11, [prep_endC]
    jg .h_w32
; w == 16 horizontal: 2 rows/iter, overlapping loads shared via shufpd
.h_w16_loop:
    movu                ym2, [srcq+strideq*0+ 0]
    vinserti32x8         m2, [srcq+strideq*1+ 0], 1
    movu                ym3, [srcq+strideq*0+16]
    vinserti32x8         m3, [srcq+strideq*1+16], 1
    lea                srcq, [srcq+strideq*2]
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m14, m4 ; b2
    pshufb               m4, m2, m7
    vpdpwssd             m0, m13, m4 ; a1
    pshufb               m4, m3, m7
    vpdpwssd             m1, m15, m4 ; b3
    shufpd               m2, m3, 0x55 ; middle 8-pixel overlap of both halves
    pshufb               m4, m2, m6
    vpdpwssd             m0, m14, m4 ; a2
    vpdpwssd             m1, m12, m4 ; b0
    pshufb               m2, m7
    vpdpwssd             m0, m15, m2 ; a3
    vpdpwssd             m1, m13, m2 ; b1
    vpermt2b             m0, m11, m1
    mova             [tmpq], m0
    add                tmpq, 64
    sub                  hd, 2
    jg .h_w16_loop
    RET
; w >= 32 horizontal: iterate 32 pixels at a time across the row (r6 < 0)
.h_w32:
    lea                srcq, [srcq+wq*2]
    neg                  wq
.h_w32_loop0:
    mov                  r6, wq
.h_w32_loop:
    movu                 m2, [srcq+r6*2+ 0]
    movu                 m3, [srcq+r6*2+ 8]
    mova                 m0, m10
    mova                 m1, m10
    pshufb               m4, m2, m6
    vpdpwssd             m0, m12, m4 ; a0
    pshufb               m4, m3, m6
    vpdpwssd             m1, m12, m4 ; b0
    vpdpwssd             m0, m14, m4 ; a2
    movu                 m4, [srcq+r6*2+16]
    pshufb               m3, m7
    vpdpwssd             m1, m13, m3 ; b1
    vpdpwssd             m0, m15, m3 ; a3
    pshufb               m3, m4, m6
    vpdpwssd             m1, m14, m3 ; b2
    pshufb               m2, m7
    vpdpwssd             m0, m13, m2 ; a1
    pshufb               m4, m7
    vpdpwssd             m1, m15, m4 ; b3
    vpermt2b             m0, m11, m1
    mova             [tmpq], m0
    add                tmpq, 64
    add                  r6, 32
    jl .h_w32_loop
    add                srcq, strideq
    dec                  hd
    jg .h_w32_loop0
    RET
; 2-D (separable) path: horizontal 8-tap then vertical 8-tap on the
; intermediate; m11 holds the vertical rounding constant
.hv:
    vpbroadcastd        m11, [pd_128]
    cmp                  wd, 4
    jg .hv_w8
    movzx               mxd, mxb
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 4
    cmove               myd, mxd     ; h == 4: 4-tap vertical variant
    mov                 r5d, r7m     ; bitdepth_max
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [strideq*3]
    sub                srcq, 2
    shr                 r5d, 11
    sub                srcq, r6
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    psllw              xmm1, 2
    mova          [tmpq+ 0], xmm0    ; spill h taps
    mova          [tmpq+16], xmm1    ; spill v taps
    vpbroadcastd        m12, xmm1
    movu               xm16, [srcq+strideq*0]
    mov                 r3d, 0xff0
    vinserti128        ym16, [srcq+strideq*1], 1
    kmovw                k1, r3d
    vbroadcasti32x4     m18, [srcq+strideq*2]
    add                srcq, r6
    vinserti64x2    m16{k1}, m18, [srcq+strideq*0], 3
    movu               xm17, [srcq+strideq*1]
    vbroadcasti32x4    ym18, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x4    m17{k1}, m18, [srcq+strideq*0], 2
    vbroadcasti32x4      m5, [spel_h_shufA]
    vbroadcasti32x4      m6, [spel_h_shufB]
    vpbroadcastd         m8, [tmpq+ 4]
    vpbroadcastd         m9, [tmpq+ 8]
    mova                 m1, m10
    mova                m19, [spel_shuf4a]
    mova                 m2, m10
    pshufb               m0, m16, m5
    vpdpwssd             m1, m8, m0
    pshufb               m0, m17, m5
    vpdpwssd             m2, m8, m0
    vpbroadcastd        m13, [tmpq+20]
    pshufb              m16, m6
    vpbroadcastd        m14, [tmpq+24]
    pshufb              m17, m6
    vpbroadcastd        m15, [tmpq+28]
    vpdpwssd             m1, m9, m16       ; 0 1 2 3
    vpdpwssd             m2, m9, m17       ; 4 5 6
    mova                 m7, [spel_shuf4b]
    vpermt2b             m1, m19, m2       ; 01 12 23 34
    vpermb               m2, m19, m2       ; 45 56
    mova               ym19, [prep_endA]
    vshufi32x4           m2, m1, m2, q1032 ; 23 34 45 56
.hv_w4_loop:
    movu               xm17, [srcq+strideq*1]
    vinserti128        ym17, [srcq+strideq*2], 1
    vbroadcasti32x4     m16, [srcq+r6       ]
    lea                srcq, [srcq+strideq*4]
    vinserti64x2    m17{k1}, m16, [srcq+strideq*0], 3
    mova                m18, m10
    pshufb              m16, m17, m5
    vpdpwssd            m18, m8, m16
    mova                m16, m11
    vpdpwssd            m16, m12, m1       ; a0 b0 c0 d0
    pshufb              m17, m6
    vpdpwssd            m18, m9, m17       ; 7 8 9 a
    mova                 m1, m2
    vpdpwssd            m16, m13, m2       ; a1 b1 c1 d1
    vpermt2b             m2, m7, m18       ; 67 78 89 9a
    vpdpwssd            m16, m15, m2       ; a3 b3 c3 d3
    vshufi32x4           m1, m2, q1032     ; 45 56 67 78
    vpdpwssd            m16, m14, m1       ; a2 b2 c2 d2
    vpermb              m16, m19, m16
    mova             [tmpq], ym16
    add                tmpq, 32
    sub                  hd, 4
    jg .hv_w4_loop
    vzeroupper
    RET
; w >= 8 2-D: broadcast h taps (m12-m15) and v taps (m16-m19)
.hv_w8:
    shr                 mxd, 16
    pmovsxbw           xmm0, [base+subpel_filters+mxq*8]
    movzx               mxd, myb
    shr                 myd, 16
    cmp                  hd, 6
    cmovs               myd, mxd     ; h < 6: 4-tap vertical variant
    mov                 r5d, r7m     ; bitdepth_max
    pmovsxbw           xmm1, [base+subpel_filters+myq*8]
    lea                  r6, [strideq*3]
    sub                srcq, 6
    shr                 r5d, 11
    sub                srcq, r6
    psllw              xmm0, [base+prep_hv_shift+r5*8]
    psllw              xmm1, 2
    mova          [tmpq+ 0], xmm0
    mova          [tmpq+16], xmm1
    vpbroadcastd        m12, xmm0
    vpbroadcastd        m13, [tmpq+ 4]
    vpbroadcastd        m14, [tmpq+ 8]
    vpbroadcastd        m15, [tmpq+12]
    vpbroadcastd        m16, xmm1
    vpbroadcastd        m17, [tmpq+20]
    vpbroadcastd        m18, [tmpq+24]
    vpbroadcastd        m19, [tmpq+28]
    cmp                  wd, 8
    jg .hv_w16
    WIN64_SPILL_XMM      23
    mova                 m5, [spel_h_shufA]
    movu                ym0, [srcq+strideq*0]
    vinserti32x8         m0, [srcq+strideq*1], 1 ; 0 1
    movu                ym9, [srcq+strideq*2]
    add                srcq, r6
    vinserti32x8         m9, [srcq+strideq*0], 1 ; 2 3
    movu               ym20, [srcq+strideq*1]
    vinserti32x8        m20, [srcq+strideq*2], 1 ; 4 5
    add                srcq, r6
    movu               ym21, [srcq+strideq*0]    ; 6
    movu                 m6, [spel_h_shufB]
    movu                 m7, [spel_h_shufC]
    mova               ym22, [prep_endB]
    vpermb               m8, m5, m0
    mova                 m1, m10
    vpdpwssd             m1, m12, m8  ; a0 b0
    vpermb               m8, m5, m9
    mova                 m2, m10
    vpdpwssd             m2, m12, m8  ; c0 d0
    vpermb               m8, m5, m20
    mova                 m3, m10
    vpdpwssd             m3, m12, m8  ; e0 f0
    vpermb               m8, m5, m21
    mova                 m4, m10
    vpdpwssd             m4, m12, m8  ; g0
    vpermb               m8, m6, m0
    vpdpwssd             m1, m13, m8  ; a1 b1
    vpermb               m8, m6, m9
    vpdpwssd             m2, m13, m8  ; c1 d1
    vpermb               m8, m6, m20
    vpdpwssd             m3, m13, m8  ; e1 f1
    vpermb               m8, m6, m21
    vpdpwssd             m4, m13, m8  ; g1
    vpermb               m8, m7, m0
    vpdpwssd             m1, m14, m8  ; a2 b2
    vpermb               m8, m7, m9
    vpdpwssd             m2, m14, m8  ; c2 d2
    vpermb               m8, m7, m20
    vpdpwssd             m3, m14, m8  ; e2 f2
    vpermb               m8, m7, m21
    vpdpwssd             m4, m14, m8  ; g2
    mova                 m8, [spel_h_shufD]
    vpermb               m0, m8, m0
    vpdpwssd             m1, m15, m0  ; a3 b3
    mova                 m0, [spel_shuf8a]
    vpermb               m9, m8, m9
    vpdpwssd             m2, m15, m9  ; c3 d3
    mova                 m9, [spel_shuf8b]
    vpermb              m20, m8, m20
    vpdpwssd             m3, m15, m20 ; e3 f3
    vpermb              m21, m8, m21
    vpdpwssd             m4, m15, m21 ; g3
    vpermt2b             m1, m0, m2   ; 01 12
    vpermt2b             m2, m0, m3   ; 23 34
    vpermt2b             m3, m0, m4   ; 45 56
.hv_w8_loop:
    movu                ym0, [srcq+strideq*1]
    lea                srcq, [srcq+strideq*2]
    vinserti32x8         m0, [srcq+strideq*0], 1
    mova                 m4, m10
    mova                m20, m11
    vpermb              m21, m5, m0
    vpdpwssd             m4, m12, m21 ; h0 i0
    vpermb              m21, m6, m0
    vpdpwssd            m20, m16, m1  ; A0 B0
    vpdpwssd             m4, m13, m21 ; h1 i1
    vpermb              m21, m7, m0
    mova                 m1, m2
    vpdpwssd            m20, m17, m2  ; A1 B1
    vpdpwssd             m4, m14, m21 ; h2 i2
    vpermb              m21, m8, m0
    mova                 m2, m3
    vpdpwssd            m20, m18, m3  ; A2 B2
    vpdpwssd             m4, m15, m21 ; h3 i3
    vpermt2b             m3, m9, m4   ; 67 78
    vpdpwssd            m20, m19, m3  ; A3 B3
    vpermb              m20, m22, m20
    mova             [tmpq], ym20
    add                tmpq, 32
    sub                  hd, 2
    jg .hv_w8_loop
    RET
; w >= 16 2-D: 16-pixel column strips; r5 packs h (low byte) and strip count
.hv_w16:
    WIN64_SPILL_XMM      27
%if WIN64
    push                 r8
%endif
    vbroadcasti32x4     m20, [spel_h_shufA]
    vbroadcasti32x4     m21, [spel_h_shufB]
    add                  wd, wd       ; w *= 2: tmp row stride in bytes
    mova                 m9, [spel_shuf16]
    mova                m26, [prep_endB]
    lea                 r5d, [hq+wq*8-256]
.hv_w16_loop0:
    vbroadcasti32x8      m5, [srcq+strideq*0+ 8]
    vinserti32x8         m4, m5, [srcq+strideq*0+ 0], 0
    vinserti32x8         m5, [srcq+strideq*0+16], 1 ; 0
    movu                ym6, [srcq+strideq*1+ 0]
    movu                ym7, [srcq+strideq*1+16]
    lea                  r7, [srcq+r6]
    vinserti32x8         m6, [srcq+strideq*2+ 0], 1
    vinserti32x8         m7, [srcq+strideq*2+16], 1 ; 1 2
    movu               ym22, [r7  +strideq*0+ 0]
    movu               ym23, [r7  +strideq*0+16]
    mov                  r8, tmpq
    vinserti32x8        m22, [r7  +strideq*1+ 0], 1
    vinserti32x8        m23, [r7  +strideq*1+16], 1 ; 3 4
    movu               ym24, [r7  +strideq*2+ 0]
    movu               ym25, [r7  +strideq*2+16]
    add                  r7, r6
    vinserti32x8        m24, [r7  +strideq*0+ 0], 1
    vinserti32x8        m25, [r7  +strideq*0+16], 1 ; 5 6
    pshufb               m0, m4, m20
    mova                 m1, m10
    vpdpwssd             m1, m12, m0    ; a0
    pshufb               m0, m6, m20
    mova                 m2, m10
    vpdpwssd             m2, m12, m0    ; b0
    pshufb               m0, m7, m20
    mova                 m3, m10
    vpdpwssd             m3, m14, m0    ; c2
    pshufb               m0, m4, m21
    vpdpwssd             m1, m13, m0    ; a1
    pshufb               m0, m6, m21
    vpdpwssd             m2, m13, m0    ; b1
    pshufb               m0, m7, m21
    vpdpwssd             m3, m15, m0    ; c3
    pshufb               m0, m5, m20
    vpdpwssd             m1, m14, m0    ; a2
    shufpd               m6, m7, 0x55
    pshufb               m7, m6, m20
    vpdpwssd             m2, m14, m7    ; b2
    vpdpwssd             m3, m12, m7    ; c0
    pshufb               m5, m21
    vpdpwssd             m1, m15, m5    ; a3
    pshufb               m6, m21
    vpdpwssd             m2, m15, m6    ; b3
    vpdpwssd             m3, m13, m6    ; c1
    pshufb               m0, m22, m20
    mova                 m4, m10
    vpdpwssd             m4, m12, m0    ; d0
    pshufb               m0, m23, m20
    mova                 m5, m10
    vpdpwssd             m5, m14, m0    ; e2
    pshufb               m0, m24, m20
    mova                 m6, m10
    vpdpwssd             m6, m12, m0    ; f0
    pshufb               m0, m25, m20
    mova                 m7, m10
    vpdpwssd             m7, m14, m0    ; g2
    pshufb               m0, m22, m21
    vpdpwssd             m4, m13, m0    ; d1
    pshufb               m0, m23, m21
    vpdpwssd             m5, m15, m0    ; e3
    pshufb               m0, m24, m21
    vpdpwssd             m6, m13, m0    ; f1
    pshufb               m0, m25, m21
    vpdpwssd             m7, m15, m0    ; g3
    shufpd              m22, m23, 0x55
    pshufb              m23, m22, m20
    vpdpwssd             m4, m14, m23   ; d2
    vpdpwssd             m5, m12, m23   ; e0
    shufpd              m24, m25, 0x55
    pshufb              m25, m24, m20
    vpdpwssd             m6, m14, m25   ; f2
    vpdpwssd             m7, m12, m25   ; g0
    pshufb              m22, m21
    vpdpwssd             m4, m15, m22   ; d3
    vpdpwssd             m5, m13, m22   ; e1
    pshufb              m24, m21
    vpdpwssd             m6, m15, m24   ; f3
    vpdpwssd             m7, m13, m24   ; g1
    pslldq               m1, 1
    vpermt2b             m2, m9, m3     ; 12
    vpermt2b             m4, m9, m5     ; 34
    vpermt2b             m6, m9, m7     ; 56
    vpshrdd              m1, m2, 16     ; 01
    vpshrdd              m3, m2, m4, 16 ; 23
    vpshrdd              m5, m4, m6, 16 ; 45
.hv_w16_loop:
    movu               ym24, [r7+strideq*1+ 0]
    movu               ym25, [r7+strideq*1+16]
    lea                  r7, [r7+strideq*2]
    vinserti32x8        m24, [r7+strideq*0+ 0], 1
    vinserti32x8        m25, [r7+strideq*0+16], 1
    mova                 m7, m10
    mova                 m8, m10
    pshufb               m0, m24, m20
    vpdpwssd             m7, m12, m0    ; h0
    mova                m22, m11
    pshufb               m0, m25, m20
    vpdpwssd             m8, m14, m0    ; i2
    mova                m23, m11
    vpdpwssd            m22, m16, m1    ; A0
    mova                 m1, m3
    vpdpwssd            m23, m16, m2    ; B0
    mova                 m2, m4
    pshufb               m0, m24, m21
    vpdpwssd             m7, m13, m0    ; h1
    pshufb               m0, m25, m21
    vpdpwssd             m8, m15, m0    ; i3
    vpdpwssd            m22, m17, m3    ; A1
    mova                 m3, m5
    vpdpwssd            m23, m17, m4    ; B1
    mova                 m4, m6
    shufpd              m24, m25, 0x55
    pshufb              m25, m24, m20
    vpdpwssd             m7, m14, m25   ; h2
    vpdpwssd             m8, m12, m25   ; i0
    vpdpwssd            m22, m18, m5    ; A2
    vpdpwssd            m23, m18, m6    ; B2
    pshufb              m24, m21
    vpdpwssd             m7, m15, m24   ; h3
    vpdpwssd             m8, m13, m24   ; i1
    vpermt2b             m7, m9, m8     ; 78
    vpshrdd              m5, m6, m7, 16 ; 67
    vpdpwssd            m22, m19, m5    ; A3
    vpdpwssd            m23, m19, m7    ; B3
    mova                 m6, m7
    vpermt2b            m22, m26, m23
    mova          [r8+wq*0], ym22
    vextracti32x8 [r8+wq*1], m22, 1
    lea                  r8, [r8+wq*2]
    sub                  hd, 2
    jg .hv_w16_loop
    add                srcq, 32     ; next 16-pixel column strip
    add                tmpq, 32
    movzx                hd, r5b    ; reload row count from packed counter
    sub                 r5d, 1<<8
    jg .hv_w16_loop0
%if WIN64
    pop                  r8
%endif
    RET
4718
; Map the t0 temporary register per ABI: r5 is a volatile GPR on Win64,
; r7 on SysV — t0 is used below for pixel_max-derived table indexing.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
4724
;-----------------------------------------------------------------------------
; warp_affine_8x8t_16bpc: intermediate-precision (tmp buffer) variant of the
; 8x8 affine warp. Reuses the filtering core (.main/.main2) and the store
; tail (.end) of warp_affine_8x8_16bpc below, but applies the "t" rounding
; constant and a >>15 narrowing with signed saturation (packssdw) instead of
; the unsigned pack used for final pixels.
; In:  tmp, ts (stride, in units doubled below), plus stack args read via
;      r7m (presumably bitdepth_max — t0 = r7m>>11 selects 10/12-bit tables).
;-----------------------------------------------------------------------------
cglobal warp_affine_8x8t_16bpc, 4, 7, 22, tmp, ts
%define base r6-pd_0to7
    mov                 t0d, r7m
    lea                  r6, [pd_0to7]
    shr                 t0d, 11              ; 0 for 10-bit, 1 for 12-bit
    vpbroadcastd         m8, [base+warp_8x8t_rnd_v]
    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main
    psrad               m14, m16, 15         ; rows 0-1
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad               m16, 15
    packssdw            m14, m16             ; rows 0-3, signed 16-bit
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    psrad               m15, m16, 15
    call mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).main2
    add                 tsq, tsq             ; stride: pixels -> bytes
    psrad               m16, 15
    packssdw            m15, m16             ; rows 4-7
    jmp mangle(private_prefix %+ _warp_affine_8x8_16bpc_avx512icl).end
4744
;-----------------------------------------------------------------------------
; warp_affine_8x8_16bpc: 8x8 affine warp, final-pixel output.
; .main  performs setup (filter pointers, alpha/beta/gamma/delta increments)
;        plus the first 7 horizontal filter rows; .main2 computes one
;        vertical filter step producing two output rows in m16; .h filters
;        two source rows horizontally.
; Register contract across .main/.main2/.h (also used by the 8x8t entry
; above): m1/m7 = horizontal rounders, m8 = vertical rounder, m9 = shift
; control, m10-m13 = permute/shuffle tables, m18/m20 = running tmx/tmy,
; ym19/ym21 = beta/delta, k1/k2 = gather masks, r4 = filter table.
;-----------------------------------------------------------------------------
cglobal warp_affine_8x8_16bpc, 4, 7, 22, dst, ds, src, ss, abcd
    mov                 t0d, r7m ; pixel_max
    lea                  r6, [pd_0to7]
    shr                 t0d, 11              ; bitdepth table index
    vpbroadcastd         m1, [base+warp_8x8_rnd_h+t0*4]
    vpbroadcastd         m8, [base+warp_8x8_rnd_v+t0*4]
    call .main
    psrad               m14, m16, 13
    call .main2
    psrad               m16, 13
    packusdw            m14, m16             ; rows 0-3, unsigned saturate
    call .main2
    psrad               m15, m16, 13
    call .main2
    vpbroadcastd         m0, [base+bidir_shift+t0*4]
    vpsrlvw             m14, m0              ; final per-bitdepth shift
    psrad               m16, 13
    packusdw            m15, m16             ; rows 4-7
    vpsrlvw             m15, m0
.end:
    ; Shared store tail (also jumped to from warp_affine_8x8t_16bpc):
    ; reorder packed rows and write 8 rows of 8 pixels, 16 bytes per row.
    mova                 m0, [base+warp8x8_end]
    vpermb              m16, m0, m14
    lea                  r2, [dsq*3]
    mova          [dstq+dsq*0], xm16
    vextracti128  [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2   ], m16, 3
    vpermb              m16, m0, m15
    lea                dstq, [dstq+dsq*4]
    mova          [dstq+dsq*0], xm16
    vextracti128  [dstq+dsq*1], ym16, 1
    vextracti32x4 [dstq+dsq*2], m16, 2
    vextracti32x4 [dstq+r2   ], m16, 3
    RET
.main:
    vpbroadcastd        ym3, [base+pd_512]   ; rounding bias for mx/my
%if WIN64
    ; On Win64 the 5th arg lives on the stack; SysV has it in r5d already.
    mov               abcdq, r5mp
    vpaddd             ym18, ym3, r6m {1to8} ; mx
%else
    add                 r5d, 512
    vpbroadcastd       ym18, r5d
%endif
    vpaddd             ym20, ym3, r7m {1to8} ; my
    mova               ym16, [base+pd_0to7]
    vpbroadcastd       ym19, [abcdq+4*0]     ; alpha
    vpbroadcastd       ym21, [abcdq+4*1]     ; gamma
    lea                  r4, [ssq*3+6]
    vpdpwssd           ym18, ym19, ym16      ; tmx
    vpdpwssd           ym20, ym21, ym16      ; tmy
    sub                srcq, r4              ; back up 3 rows + 3 pixels
    mova                m10, [base+warp8x8_permA]
    lea                  r4, [mc_warp_filter+64*8]
    vbroadcasti32x4     m12, [base+warp8x8_permC]
    kxnorb               k1, k1, k1          ; all-ones gather mask
    vbroadcasti32x4     m13, [base+warp8x8_permD]
    movu                ym5, [srcq+0]
    vinserti32x8         m5, [srcq+8], 1
    psrad              ym17, ym18, 10        ; filter index = tmx >> 10
    mova                m11, [base+warp8x8_permB]
    kmovb                k2, k1
    vpgatherdq       m3{k1}, [r4+ym17*8]    ; filter_x0
    psrad              ym19, 16             ; beta
    psrad              ym21, 16             ; delta
    paddd              ym18, ym19           ; mx += beta
    vpermb               m4, m10, m5
    vpbroadcastq         m9, [base+warp_shift_h+t0*8]
    pshufd               m3, m3, q3120
    paddd                m7, m1, m1         ; doubled rounder for .h
    pshufb               m2, m3, m12
    vpdpwssd             m1, m4, m2
    vpermb               m5, m11, m5
    vshufi32x4           m4, m5, q1021
    pshufb               m3, m13
    vpdpwssd             m1, m4, m3
    call .h
    psllq                m2, m1, 32
    paddd                m1, m2
    vpmultishiftqb       m1, m9, m1          ; per-bitdepth narrowing shift
    vpshrdq              m1, m0, 48          ; 01 12
    call .h
    vpshrdq              m2, m1, m0, 48      ; 23 34
    call .h
    vpshrdq              m3, m2, m0, 48      ; 45 56
.main2:
    ; One vertical step: gather two y-filters, accumulate 4 taps pairs
    ; from the row history in m1/m2/m3 (+ new rows via .h) into m16.
    call .h
    psrad               ym6, ym20, 10
    kmovb                k1, k2
    paddd              ym17, ym20, ym21      ; my += delta
    vpgatherdq      m20{k2}, [r4+ym6*8]      ; filter_y0
    psrad              ym16, ym17, 10
    kmovb                k2, k1
    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_y1
    shufps               m5, m20, m6, q2020
    mova                m16, m8              ; start from vertical rounder
    pshufb               m4, m5, m12
    vpdpwssd            m16, m1, m4          ; a0 b0
    pshufb               m5, m13
    mova                 m1, m2              ; shift row history down
    vpdpwssd            m16, m2, m5          ; a1 b1
    shufps               m6, m20, m6, q3131
    paddd              ym20, ym17, ym21
    pshufb               m4, m6, m12
    mova                 m2, m3
    vpdpwssd            m16, m3, m4          ; a2 b2
    vpshrdq              m3, m0, 48          ; 67 78
    pshufb               m6, m13
    vpdpwssd            m16, m3, m6          ; a3 b3
    ret
ALIGN function_align
.h:
    ; Horizontal filter for two rows -> m0 ("a a b b" after the final
    ; multishift); advances srcq by two rows and mx by two betas.
    movu               ym16, [srcq+ssq*1]
    psrad               ym6, ym18, 10
    lea                srcq, [srcq+ssq*2]
    vinserti32x8         m5, m16, [srcq+ssq*0], 1
    kmovb                k1, k2
    paddd              ym17, ym18, ym19      ; mx += beta
    vpgatherdq      m18{k2}, [r4+ym6*8]      ; filter_x1
    psrad              ym16, ym17, 10
    kmovb                k2, k1
    vpgatherdq       m6{k1}, [r4+ym16*8]     ; filter_x2
    vpermb               m4, m10, m5
    shufps              m16, m18, m6, q2020
    shufps               m6, m18, m6, q3131
    mova                 m0, m7              ; start from doubled rounder
    pshufb              m18, m16, m12
    vpdpwssd             m0, m4, m18         ; a0 b0
    vpermb               m5, m11, m5
    pshufb              m18, m6, m13
    vpdpwssd             m0, m5, m18         ; a3 b3
    paddd              ym18, ym17, ym19
    vshufi32x4          m17, m4, m5, q1021
    pshufb              m16, m13
    vpdpwssd             m0, m17, m16        ; a1 b1
    vshufi32x4           m4, m5, q2132
    pshufb               m6, m12
    vpdpwssd             m0, m4, m6          ; a2 b2
    vpmultishiftqb       m0, m9, m0          ; a a b b
    ret
4884
; Store skeleton shared by the bidirectional averaging functions below
; (avg/w_avg/mask). The instantiating function sets up wq as a jump-table
; target and provides a .main that produces two zmm registers of output
; pixels (m0 = first half, m1 = second half) per call. Each .wN label
; writes those registers out at width N, looping over rows via hd.
%macro BIDIR_FN 0
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq                  ; dispatch on width
.w4:
    ; m0/m1 hold 16 rows of 4 pixels; store 4 rows per block of moves.
    movq   [dstq          ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp                  hd, 8
    jl .w4_end                               ; h == 4
    vextracti32x4       xm2, m0, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end                               ; h == 8
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4       xm0, ym1, 1
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    vextracti32x4       xm0, m1, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq          ], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w8:
    ; One zmm = 4 rows of 8 pixels.
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                  hd, 8
    jl .w8_end                               ; only 4 rows remained
    lea                dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w16:
    ; One zmm = 2 rows of 16 pixels.
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w32:
    ; One zmm = 1 row of 32 pixels.
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    ; Two zmm = 1 row of 64 pixels.
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    ; Two .main calls per row of 128 pixels.
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    call .main
    mova        [dstq+64*2], m0
    mova        [dstq+64*3], m1
    dec                  hd
    jg .w128_loop
    RET
%endmacro
4980
; Re-declare t0 for the bidir functions below (r5 on Win64, r7 on SysV),
; as above: a volatile scratch register for table indexing.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
4986
;-----------------------------------------------------------------------------
; avg_16bpc: plain bidirectional average of two intermediate buffers.
; Register contract with BIDIR_FN's .main: m2 = per-bitdepth round/clamp
; constant, m3 = per-bitdepth right-shift counts; m0/m1 = output halves.
;-----------------------------------------------------------------------------
cglobal avg_16bpc, 4, 7, 4, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-avg_avx512icl_table
    lea                  r6, [avg_avx512icl_table]
    tzcnt                wd, wm
    mov                 t0d, r6m ; pixel_max
    movsxd               wq, [r6+wq*4]       ; width-specific entry offset
    shr                 t0d, 11              ; bitdepth table index
    vpbroadcastd         m2, [base+avg_round+t0*4]
    vpbroadcastd         m3, [base+avg_shift+t0*4]
    movifnidn            hd, hm
    add                  wq, r6              ; absolute jump target
    BIDIR_FN
ALIGN function_align
.main:
    ; Load+add 64 pixels from each tmp buffer, then clamp-and-bias via
    ; pmaxsw/psubsw against the round constant and shift down per lane.
    mova                 m0, [tmp1q+64*0]
    paddsw               m0, [tmp2q+64*0]
    mova                 m1, [tmp1q+64*1]
    paddsw               m1, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    pmaxsw               m0, m2
    pmaxsw               m1, m2
    psubsw               m0, m2
    psubsw               m1, m2
    vpsrlvw              m0, m3
    vpsrlvw              m1, m3
    ret
5014
;-----------------------------------------------------------------------------
; w_avg_16bpc: weighted bidirectional average.
; m6 holds the interleaved pair (16-weight, weight) so one vpdpwssd per
; dword computes tmp2*(16-w) + tmp1*w; m5 = rounder, m7 = final shift.
;-----------------------------------------------------------------------------
cglobal w_avg_16bpc, 4, 7, 8, dst, stride, tmp1, tmp2, w, h, stride3
%define base r6-w_avg_avx512icl_table
    lea                  r6, [w_avg_avx512icl_table]
    tzcnt                wd, wm
    mov                 t0d, r7m ; pixel_max
    shr                 t0d, 11
    movsxd               wq, [r6+wq*4]
    vpbroadcastd         m5, [base+w_avg_round+t0*4]
    vpbroadcastd         m7, [base+bidir_shift+t0*4]
    add                  wq, r6
    mov                 r6d, r6m ; weight
    lea                 t0d, [r6-16]
    shl                 r6d, 16
    sub                 r6d, t0d ; 16-weight, weight
    movifnidn            hd, hm
    vpbroadcastd         m6, r6d
    BIDIR_FN
ALIGN function_align
.main:
    ; Interleave tmp2/tmp1 words so each dword is a (t2, t1) pair, then
    ; dot-product with (16-w, w), round, >>2, pack and per-lane shift.
    mova                 m3, [tmp1q+64*0]
    mova                 m1, [tmp2q+64*0]
    mova                 m0, [tmp1q+64*1]
    mova                 m4, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    punpcklwd            m2, m1, m3
    punpckhwd            m1, m3
    punpcklwd            m3, m4, m0
    punpckhwd            m4, m0
    mova                 m0, m5
    vpdpwssd             m0, m6, m2
    mova                 m2, m5
    vpdpwssd             m2, m6, m1
    mova                 m1, m5
    vpdpwssd             m1, m6, m3
    mova                 m3, m5
    vpdpwssd             m3, m6, m4
    REPX       {psrad x, 2}, m0, m2, m1, m3
    packusdw             m0, m2
    packusdw             m1, m3
    vpsrlvw              m0, m7
    vpsrlvw              m1, m7
    ret
5058
;-----------------------------------------------------------------------------
; mask_16bpc: per-pixel masked blend of two intermediate buffers using an
; explicit 8-bit mask (0..64). m8 = pw_64, m9 = rounder, m10 = final shift.
;-----------------------------------------------------------------------------
cglobal mask_16bpc, 4, 8, 11, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-mask_avx512icl_table
    lea                  r7, [mask_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r7m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m8, [base+pw_64]
    vpbroadcastd         m9, [base+mask_round+r6*4]
    vpbroadcastd        m10, [base+bidir_shift+r6*4]
    mov               maskq, maskmp
    add                  wq, r7
    BIDIR_FN
ALIGN function_align
.main:
    ; Widen mask bytes to words, pair each (tmp1, tmp2) word with its
    ; (m, 64-m) weight pair, and blend with one dot-product per dword.
    pmovzxbw             m1, [maskq+32*0]
    mova                 m4, [tmp1q+64*0]
    mova                 m2, [tmp2q+64*0]
    pmovzxbw             m6, [maskq+32*1]
    mova                 m5, [tmp1q+64*1]
    mova                 m3, [tmp2q+64*1]
    add               maskq, 32*2
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    punpcklwd            m7, m4, m2
    punpckhwd            m4, m2
    psubw                m0, m8, m1
    punpcklwd            m2, m1, m0 ; m, 64-m
    punpckhwd            m1, m0
    mova                 m0, m9
    vpdpwssd             m0, m7, m2
    mova                 m2, m9
    vpdpwssd             m2, m4, m1 ; tmp1 * m + tmp2 * (64-m)
    punpcklwd            m7, m5, m3
    punpckhwd            m5, m3
    psubw                m1, m8, m6
    punpcklwd            m3, m6, m1
    punpckhwd            m6, m1
    mova                 m1, m9
    vpdpwssd             m1, m7, m3
    mova                 m3, m9
    vpdpwssd             m3, m5, m6
    REPX       {psrad x, 4}, m0, m2, m1, m3
    packusdw             m0, m2
    packusdw             m1, m3
    vpsrlvw              m0, m10
    vpsrlvw              m1, m10
    ret
5108
;-----------------------------------------------------------------------------
; w_mask_420_16bpc: blend two intermediate buffers with a mask derived from
; their per-pixel difference, and also emit that mask downsampled 2x2
; (4:2:0 chroma layout). .main produces blended pixels in m0/m1 plus the
; per-pixel "m" weights in m2/m3; the width-specific paths below combine
; 2x2 groups of weights (vpdpbusd/vpdpwssd against m14's round constant,
; which encodes the 'sign' rounding) before storing to maskq.
;-----------------------------------------------------------------------------
cglobal w_mask_420_16bpc, 4, 8, 16, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_420_avx512icl_table
    lea                  r7, [w_mask_420_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11
    movsxd               wq, [r7+wq*4]
    vpbroadcastd        m10, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd        m11, [base+pw_64]
    vpbroadcastd        m12, [base+mask_round+r6*4]
    vpbroadcastd        m13, [base+bidir_shift+r6*4]
    mov                 r6d, r7m ; sign
    vpbroadcastd        m14, [base+w_mask_round+r6*4]
    mova               ym15, [w_mask_end42x]
    mov               maskq, maskmp
    add                  wq, r7
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq                   ; width dispatch
.w4:
    ; 16 rows of 4 pixels per .main call -> one 16-byte mask store.
    mova                 m4, [w_mask_shuf4]
    vpermt2b             m2, m4, m3           ; gather 2x2 weight groups
    mova                 m3, m14
    vpdpbusd             m3, m2, [pb_64] {1to16}
    vpermb               m3, m15, m3
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    mova            [maskq], xm3
    cmp                  hd, 8
    jl .w4_end
    vextracti32x4       xm2, m0, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4       xm2, ym1, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4       xm2, m1, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8:
    ; Load width-specific shuffle constants once, outside the loop.
    mova                 m8, [w_mask_shuf8]
    vpbroadcastd         m9, [pb_64]
    jmp .w8_start
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w8_start:
    vpermt2b             m2, m8, m3
    mova                 m3, m14
    vpdpbusd             m3, m2, m9
    vpermb               m3, m15, m3
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    mova            [maskq], xm3
    sub                  hd, 8
    jl .w8_end
    lea                dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16:
    mova                 m8, [w_mask_shuf16]
    vpbroadcastd         m9, [pb_64]
    jmp .w16_start
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 16
.w16_start:
    vpermt2b             m2, m8, m3
    mova                 m3, m14
    vpdpbusd             m3, m2, m9
    vpermb               m3, m15, m3
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    mova            [maskq], xm3
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
    add               maskq, 32
.w32:
    ; Sum vertically-adjacent weight rows (paddw), then pairwise words
    ; horizontally via vpdpwssd with pw_64 to finish the 2x2 average.
    paddw                m2, m3
    mova                 m8, m14
    vpdpwssd             m8, m11, m2
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    call .main
    paddw                m2, m3
    mova                 m3, m14
    vpdpwssd             m3, m11, m2
    vpermt2b             m8, m15, m3
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m1
    mova            [maskq], ym8
    sub                  hd, 4
    jg .w32_loop
    RET
.w64_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 32
.w64:
    ; m8/m9 carry the first row's weights across the second .main call.
    mova                 m8, m2
    mova                 m9, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    paddw                m8, m2
    paddw                m9, m3
    mova                 m2, m14
    vpdpwssd             m2, m11, m8
    mova                 m3, m14
    vpdpwssd             m3, m11, m9
    vpermt2b             m2, m15, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    mova            [maskq], ym2
    sub                  hd, 2
    jg .w64_loop
    RET
.w128_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
    add               maskq, 64
.w128:
    ; Four .main calls per 2 output rows; m16/m8 and m17/m9 hold the
    ; left/right halves of the first row's weights respectively.
    mova               m16, m2
    mova                m8, m3
    mova [dstq+strideq*0+64*0], m0
    mova [dstq+strideq*0+64*1], m1
    call .main
    mova                m17, m2
    mova                 m9, m3
    mova [dstq+strideq*0+64*2], m0
    mova [dstq+strideq*0+64*3], m1
    call .main
    paddw                m2, m16
    paddw                m3, m8
    mova                m16, m14
    vpdpwssd            m16, m11, m2
    mova                 m8, m14
    vpdpwssd             m8, m11, m3
    mova [dstq+strideq*1+64*0], m0
    mova [dstq+strideq*1+64*1], m1
    call .main
    paddw                m2, m17
    paddw                m3, m9
    mova                m17, m14
    vpdpwssd            m17, m11, m2
    mova                 m9, m14
    vpdpwssd             m9, m11, m3
    vpermt2b            m16, m15, m8
    vpermt2b            m17, m15, m9
    mova [dstq+strideq*1+64*2], m0
    mova [dstq+strideq*1+64*3], m1
    mova       [maskq+32*0], ym16
    mova       [maskq+32*1], ym17
    sub                  hd, 2
    jg .w128_loop
    vzeroupper
    RET
ALIGN function_align
.main:
    ; Derive weight m = 38 + clamp(|tmp1-tmp2|) style mapping via the
    ; pw_27615 constant and >>10, then blend: out = (t2*m' + t1*m) with
    ; round/shift. Returns pixels in m0/m1 and weights in m2/m3.
    mova                 m1, [tmp1q+64*0]
    mova                 m3, [tmp2q+64*0]
    mova                 m4, [tmp1q+64*1]
    mova                 m7, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    psubsw               m6, m1, m3
    punpcklwd            m5, m3, m1
    pabsw                m6, m6
    punpckhwd            m3, m1
    psubusw              m6, m10, m6
    psrlw                m6, 10      ; 64-m
    psubw                m2, m11, m6 ; m
    punpcklwd            m1, m6, m2
    punpckhwd            m6, m2
    mova                 m0, m12
    vpdpwssd             m0, m5, m1
    mova                 m1, m12
    vpdpwssd             m1, m3, m6
    psubsw               m5, m4, m7
    punpcklwd            m6, m7, m4
    pabsw                m5, m5
    punpckhwd            m7, m4
    psubusw              m5, m10, m5
    psrlw                m5, 10
    psubw                m3, m11, m5
    punpcklwd            m4, m5, m3
    psrad                m0, 4
    punpckhwd            m5, m3
    psrad                m1, 4
    packusdw             m0, m1
    mova                 m1, m12
    vpdpwssd             m1, m6, m4
    mova                 m4, m12
    vpdpwssd             m4, m7, m5
    psrad                m1, 4
    psrad                m4, 4
    packusdw             m1, m4
    vpsrlvw              m0, m13
    vpsrlvw              m1, m13
    ret
5342
;-----------------------------------------------------------------------------
; w_mask_422_16bpc: like w_mask_420 but the mask is downsampled only
; horizontally (4:2:2). Mask folding happens inside .main (one vpdpwssd of
; the weights against m14 = pw_128 per 64 pixels), so the width paths are
; the plain BIDIR_FN-style store patterns.
;-----------------------------------------------------------------------------
cglobal w_mask_422_16bpc, 4, 8, 15, dst, stride, tmp1, tmp2, w, h, mask, stride3
%define base r7-w_mask_422_avx512icl_table
    lea                  r7, [w_mask_422_avx512icl_table]
    tzcnt                wd, wm
    mov                 r6d, r8m ; pixel_max
    movifnidn            hd, hm
    shr                 r6d, 11
    movsxd               wq, [r7+wq*4]
    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
    vpbroadcastd         m9, [base+pw_64]
    vpbroadcastd        m10, [base+mask_round+r6*4]
    vpbroadcastd        m11, [base+bidir_shift+r6*4]
    mov                 r6d, r7m ; sign
    vpbroadcastd        m12, [base+w_mask_round+r6*4]
    mova               ym13, [w_mask_end42x]
    mov               maskq, maskmp
    add                  wq, r7
    paddw               m14, m9, m9 ; pw_128
    call .main
    lea            stride3q, [strideq*3]
    jmp                  wq
.w4:
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    vextracti32x4       xm2, ym0, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    cmp                  hd, 8
    jl .w4_end
    vextracti32x4       xm2, m0, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm0, m0, 3
    movq   [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm0
    je .w4_end
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm1
    vextracti32x4       xm2, ym1, 1
    movq   [dstq+strideq*2], xm2
    movhps [dstq+stride3q ], xm2
    vextracti32x4       xm2, m1, 2
    lea                dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0], xm2
    movhps [dstq+strideq*1], xm2
    vextracti32x4       xm1, m1, 3
    movq   [dstq+strideq*2], xm1
    movhps [dstq+stride3q ], xm1
.w4_end:
    RET
.w8_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w8:
    mova          [dstq+strideq*0], xm0
    vextracti32x4 [dstq+strideq*1], ym0, 1
    vextracti32x4 [dstq+strideq*2], m0, 2
    vextracti32x4 [dstq+stride3q ], m0, 3
    sub                  hd, 8
    jl .w8_end
    lea                dstq, [dstq+strideq*4]
    mova          [dstq+strideq*0], xm1
    vextracti32x4 [dstq+strideq*1], ym1, 1
    vextracti32x4 [dstq+strideq*2], m1, 2
    vextracti32x4 [dstq+stride3q ], m1, 3
    jg .w8_loop
.w8_end:
    RET
.w16_loop:
    call .main
    lea                dstq, [dstq+strideq*4]
.w16:
    mova          [dstq+strideq*0], ym0
    vextracti32x8 [dstq+strideq*1], m0, 1
    mova          [dstq+strideq*2], ym1
    vextracti32x8 [dstq+stride3q ], m1, 1
    sub                  hd, 4
    jg .w16_loop
    RET
.w32_loop:
    call .main
    lea                dstq, [dstq+strideq*2]
.w32:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    sub                  hd, 2
    jg .w32_loop
    RET
.w64_loop:
    call .main
    add                dstq, strideq
.w64:
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    dec                  hd
    jg .w64_loop
    RET
.w128_loop:
    call .main
    add                dstq, strideq
.w128:
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    call .main
    mova        [dstq+64*2], m0
    mova        [dstq+64*3], m1
    dec                  hd
    jg .w128_loop
    RET
ALIGN function_align
.main:
    ; Same weight derivation and blend as w_mask_420's .main, but pairs of
    ; horizontal weights are folded with pw_128 (m14) and written to maskq
    ; here, advancing maskq by 32 per call.
    mova                 m1, [tmp1q+64*0]
    mova                 m3, [tmp2q+64*0]
    mova                 m4, [tmp1q+64*1]
    mova                 m7, [tmp2q+64*1]
    add               tmp1q, 64*2
    add               tmp2q, 64*2
    psubsw               m6, m1, m3
    punpcklwd            m5, m3, m1
    pabsw                m6, m6
    punpckhwd            m3, m1
    psubusw              m6, m8, m6
    psrlw                m6, 10      ; 64-m
    psubw                m2, m9, m6  ; m
    punpcklwd            m1, m6, m2
    punpckhwd            m6, m2
    mova                 m0, m10
    vpdpwssd             m0, m5, m1
    mova                 m1, m10
    vpdpwssd             m1, m3, m6
    psubsw               m5, m4, m7
    punpcklwd            m6, m7, m4
    pabsw                m5, m5
    punpckhwd            m7, m4
    psubusw              m5, m8, m5
    psrlw                m5, 10
    psubw                m3, m9, m5
    punpcklwd            m4, m5, m3
    psrad                m0, 4
    punpckhwd            m5, m3
    psrad                m1, 4
    packusdw             m0, m1
    mova                 m1, m10
    vpdpwssd             m1, m6, m4
    mova                 m4, m10
    vpdpwssd             m4, m7, m5
    mova                 m5, m12
    vpdpwssd             m5, m14, m2 ; fold horizontal weight pairs
    mova                 m2, m12
    vpdpwssd             m2, m14, m3
    psrad                m1, 4
    psrad                m4, 4
    packusdw             m1, m4
    vpermt2b             m5, m13, m2
    vpsrlvw              m0, m11
    vpsrlvw              m1, m11
    mova            [maskq], ym5
    add               maskq, 32
    ret
5504
5505cglobal w_mask_444_16bpc, 4, 8, 13, dst, stride, tmp1, tmp2, w, h, mask, stride3
5506%define base r7-w_mask_444_avx512icl_table
5507    lea                  r7, [w_mask_444_avx512icl_table]
5508    tzcnt                wd, wm
5509    mov                 r6d, r8m ; pixel_max
5510    movifnidn            hd, hm
5511    shr                 r6d, 11
5512    movsxd               wq, [r7+wq*4]
5513    vpbroadcastd         m8, [base+pw_27615] ; ((64 - 38) << 10) + 1023 - 32
5514    vpbroadcastd         m9, [base+pw_64]
5515    vpbroadcastd        m10, [base+mask_round+r6*4]
5516    mova                m11, [w_mask_end444]
5517    vpbroadcastd        m12, [base+bidir_shift+r6*4]
5518    mov               maskq, maskmp
5519    add                  wq, r7
5520    call .main
5521    lea            stride3q, [strideq*3]
5522    jmp                  wq
5523.w4:
5524    movq   [dstq+strideq*0], xm0
5525    movhps [dstq+strideq*1], xm0
5526    vextracti32x4       xm2, ym0, 1
5527    movq   [dstq+strideq*2], xm2
5528    movhps [dstq+stride3q ], xm2
5529    cmp                  hd, 8
5530    jl .w4_end
5531    vextracti32x4       xm2, m0, 2
5532    lea                dstq, [dstq+strideq*4]
5533    movq   [dstq+strideq*0], xm2
5534    movhps [dstq+strideq*1], xm2
5535    vextracti32x4       xm0, m0, 3
5536    movq   [dstq+strideq*2], xm0
5537    movhps [dstq+stride3q ], xm0
5538    je .w4_end
5539    lea                dstq, [dstq+strideq*4]
5540    movq   [dstq+strideq*0], xm1
5541    movhps [dstq+strideq*1], xm1
5542    vextracti32x4       xm2, ym1, 1
5543    movq   [dstq+strideq*2], xm2
5544    movhps [dstq+stride3q ], xm2
5545    vextracti32x4       xm2, m1, 2
5546    lea                dstq, [dstq+strideq*4]
5547    movq   [dstq+strideq*0], xm2
5548    movhps [dstq+strideq*1], xm2
5549    vextracti32x4       xm1, m1, 3
5550    movq   [dstq+strideq*2], xm1
5551    movhps [dstq+stride3q ], xm1
5552.w4_end:
5553    RET
5554.w8_loop:
5555    call .main
5556    lea                dstq, [dstq+strideq*4]
5557.w8:
5558    mova          [dstq+strideq*0], xm0
5559    vextracti32x4 [dstq+strideq*1], ym0, 1
5560    vextracti32x4 [dstq+strideq*2], m0, 2
5561    vextracti32x4 [dstq+stride3q ], m0, 3
5562    sub                  hd, 8
5563    jl .w8_end
5564    lea                dstq, [dstq+strideq*4]
5565    mova          [dstq+strideq*0], xm1
5566    vextracti32x4 [dstq+strideq*1], ym1, 1
5567    vextracti32x4 [dstq+strideq*2], m1, 2
5568    vextracti32x4 [dstq+stride3q ], m1, 3
5569    jg .w8_loop
5570.w8_end:
5571    RET
5572.w16_loop:
5573    call .main
5574    lea                dstq, [dstq+strideq*4]
5575.w16:
5576    mova          [dstq+strideq*0], ym0
5577    vextracti32x8 [dstq+strideq*1], m0, 1
5578    mova          [dstq+strideq*2], ym1
5579    vextracti32x8 [dstq+stride3q ], m1, 1
5580    sub                  hd, 4
5581    jg .w16_loop
5582    RET
5583.w32_loop:
5584    call .main
5585    lea                dstq, [dstq+strideq*2]
5586.w32:
5587    mova   [dstq+strideq*0], m0
5588    mova   [dstq+strideq*1], m1
5589    sub                  hd, 2
5590    jg .w32_loop
5591    RET
5592.w64_loop:
5593    call .main
5594    add                dstq, strideq
5595.w64:
5596    mova        [dstq+64*0], m0
5597    mova        [dstq+64*1], m1
5598    dec                  hd
5599    jg .w64_loop
5600    RET
5601.w128_loop:
5602    call .main
5603    add                dstq, strideq
5604.w128:
5605    mova        [dstq+64*0], m0
5606    mova        [dstq+64*1], m1
5607    call .main
5608    mova        [dstq+64*2], m0
5609    mova        [dstq+64*3], m1
5610    dec                  hd
5611    jg .w128_loop
5612    RET
5613ALIGN function_align
5614.main:
5615    mova                 m1, [tmp1q+64*0]
5616    mova                 m3, [tmp2q+64*0]
5617    mova                 m4, [tmp1q+64*1]
5618    mova                 m7, [tmp2q+64*1]
5619    add               tmp1q, 64*2
5620    add               tmp2q, 64*2
5621    psubsw               m6, m1, m3
5622    punpcklwd            m5, m3, m1
5623    pabsw                m6, m6
5624    punpckhwd            m3, m1
5625    psubusw              m6, m8, m6
5626    psrlw                m6, 10
5627    psubw                m2, m9, m6
5628    punpcklwd            m1, m6, m2
5629    punpckhwd            m6, m2
5630    mova                 m0, m10
5631    vpdpwssd             m0, m5, m1
5632    mova                 m1, m10
5633    vpdpwssd             m1, m3, m6
5634    psubsw               m5, m4, m7
5635    punpcklwd            m6, m7, m4
5636    pabsw                m5, m5
5637    punpckhwd            m7, m4
5638    psubusw              m5, m8, m5
5639    psrlw                m5, 10
5640    psubw                m3, m9, m5
5641    punpcklwd            m4, m5, m3
5642    psrad                m0, 4
5643    punpckhwd            m5, m3
5644    psrad                m1, 4
5645    packusdw             m0, m1
5646    mova                 m1, m10
5647    vpdpwssd             m1, m6, m4
5648    mova                 m4, m10
5649    vpdpwssd             m4, m7, m5
5650    vpermt2b             m2, m11, m3
5651    psrad                m1, 4
5652    psrad                m4, 4
5653    packusdw             m1, m4
5654    vpsrlvw              m0, m12
5655    vpsrlvw              m1, m12
5656    mova            [maskq], m2
5657    add               maskq, 64
5658    ret
5659
;-----------------------------------------------------------------------------
; void blend_16bpc(pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp,
;                  int w, int h, const uint8_t *mask)
; Blend tmp into dst using a per-pixel byte mask:
;   dst += pmulhrsw(dst - tmp, mask * -512)
; i.e. a rounded Q15 fixed-point lerp of dst towards tmp, weighted by mask.
; NOTE(review): C prototype inferred from the cglobal argument list — confirm
; against the project's mc function declarations.
;-----------------------------------------------------------------------------
cglobal blend_16bpc, 3, 7, 7, dst, ds, tmp, w, h, mask
%define base r6-blend_avx512icl_table
    lea                  r6, [blend_avx512icl_table]
    tzcnt                wd, wm             ; log2(w) indexes the jump table
    movifnidn            hd, hm
    movsxd               wq, [r6+wq*4]
    movifnidn         maskq, maskmp
    vpbroadcastd         m6, [base+pw_m512] ; per-word weight scale (-512)
    add                  wq, r6
    lea                  r6, [dsq*3]        ; r6 = 3*stride for 4-row blocks
    jmp                  wq                 ; dispatch to .w4/.w8/.w16/.w32
.w4: ; 4x4 pixels per iteration; 4 dst rows packed into one ymm
    pmovzxbw           ym19, [maskq]        ; zero-extend 16 mask bytes to words
    movq               xm16, [dstq+dsq*0]
    movhps             xm16, [dstq+dsq*1]
    vpbroadcastq       ym17, [dstq+dsq*2]
    vpbroadcastq       ym18, [dstq+r6   ]
    pmullw             ym19, ym6            ; weight = mask * -512
    vpblendd           ym16, ym17, 0x30     ; merge rows 0-3 into ym16
    vpblendd           ym16, ym18, 0xc0
    psubw              ym17, ym16, [tmpq]   ; dst - tmp
    add               maskq, 16
    add                tmpq, 32
    pmulhrsw           ym17, ym19           ; (diff*weight + 0x4000) >> 15
    paddw              ym16, ym17
    vextracti128       xm17, ym16, 1
    movq       [dstq+dsq*0], xm16
    movhps     [dstq+dsq*1], xm16
    movq       [dstq+dsq*2], xm17
    movhps     [dstq+r6   ], xm17
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w4
    vzeroupper
    RET
.w8: ; 8x4 pixels per iteration; 4 dst rows packed into one zmm
    pmovzxbw             m2, [maskq]
    mova                xm0, [dstq+dsq*0]
    vinserti32x4        ym0, [dstq+dsq*1], 1
    vinserti32x4         m0, [dstq+dsq*2], 2
    vinserti32x4         m0, [dstq+r6   ], 3
    pmullw               m2, m6             ; weight = mask * -512
    psubw                m1, m0, [tmpq]     ; dst - tmp
    add               maskq, 32
    add                tmpq, 64
    pmulhrsw             m1, m2             ; (diff*weight + 0x4000) >> 15
    paddw                m0, m1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    vextracti32x4 [dstq+dsq*2], m0, 2
    vextracti32x4 [dstq+r6   ], m0, 3
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w8
    RET
.w16: ; 16x4 pixels per iteration; 2 dst rows per zmm
    pmovzxbw             m4, [maskq+32*0]
    pmovzxbw             m5, [maskq+32*1]
    mova                ym0, [dstq+dsq*0]
    vinserti32x8         m0, [dstq+dsq*1], 1
    mova                ym1, [dstq+dsq*2]
    vinserti32x8         m1, [dstq+r6   ], 1
    pmullw               m4, m6             ; weights = mask * -512
    pmullw               m5, m6
    psubw                m2, m0, [tmpq+64*0] ; dst - tmp
    psubw                m3, m1, [tmpq+64*1]
    add               maskq, 32*2
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    mova          [dstq+dsq*2], ym1
    vextracti32x8 [dstq+r6   ], m1, 1
    lea                dstq, [dstq+dsq*4]
    sub                  hd, 4
    jg .w16
    RET
.w32: ; 32x2 pixels per iteration; 1 dst row per zmm
    pmovzxbw             m4, [maskq+32*0]
    pmovzxbw             m5, [maskq+32*1]
    mova                 m0, [dstq+dsq*0]
    mova                 m1, [dstq+dsq*1]
    pmullw               m4, m6             ; weights = mask * -512
    pmullw               m5, m6
    psubw                m2, m0, [tmpq+ 64*0] ; dst - tmp
    psubw                m3, m1, [tmpq+ 64*1]
    add               maskq, 32*2
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w32
    RET
5761
;-----------------------------------------------------------------------------
; void blend_v_16bpc(pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp,
;                    int w, int h)
; OBMC blend along a vertical edge: the blend weight depends only on the
; x coordinate and is read from the constant obmc_masks_avx2 table (the
; entries for each width start at byte offset w*2, per the pattern below).
;   dst += pmulhrsw(dst - tmp, obmc_mask[x])
; NOTE(review): C prototype inferred from the cglobal argument list.
;-----------------------------------------------------------------------------
cglobal blend_v_16bpc, 3, 6, 5, dst, ds, tmp, w, h
    lea                  r5, [blend_v_avx512icl_table]
    tzcnt                wd, wm             ; log2(w) indexes the jump table
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    jmp                  wq
.w2:
    vpbroadcastd       xmm2, [obmc_masks_avx2+2*2] ; w=2 mask words
.w2_loop: ; 2x2 pixels per iteration
    movd               xmm0, [dstq+dsq*0]
    pinsrd             xmm0, [dstq+dsq*1], 1
    movq               xmm1, [tmpq]
    add                tmpq, 4*2
    psubw              xmm1, xmm0, xmm1     ; dst - tmp
    pmulhrsw           xmm1, xmm2           ; (diff*mask + 0x4000) >> 15
    paddw              xmm0, xmm1
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w2_loop
    RET
.w4:
    vpbroadcastq       xmm2, [obmc_masks_avx2+4*2] ; w=4 mask row
.w4_loop: ; 4x2 pixels per iteration
    movq               xmm0, [dstq+dsq*0]
    movhps             xmm0, [dstq+dsq*1]
    psubw              xmm1, xmm0, [tmpq]   ; dst - tmp
    add                tmpq, 8*2
    pmulhrsw           xmm1, xmm2
    paddw              xmm0, xmm1
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w4_loop
    RET
.w8:
    vbroadcasti32x4     ym2, [obmc_masks_avx2+8*2] ; w=8 mask row
.w8_loop: ; 8x2 pixels per iteration, 2 rows in one ymm
    mova                xm0, [dstq+dsq*0]
    vinserti32x4        ym0, [dstq+dsq*1], 1
    psubw               ym1, ym0, [tmpq]    ; dst - tmp
    add                tmpq, 16*2
    pmulhrsw            ym1, ym2
    paddw               ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w8_loop
    RET
.w16:
    vbroadcasti32x8      m2, [obmc_masks_avx2+16*2] ; w=16 mask row
.w16_loop: ; 16x2 pixels per iteration, 2 rows in one zmm
    mova                ym0, [dstq+dsq*0]
    vinserti32x8         m0, [dstq+dsq*1], 1
    psubw                m1, m0, [tmpq]     ; dst - tmp
    add                tmpq, 32*2
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w16_loop
    RET
.w32:
    mova                 m4, [obmc_masks_avx2+32*2] ; w=32 mask row
.w32_loop: ; 32x2 pixels per iteration, 1 row per zmm
    mova                 m0,     [dstq+dsq*0]
    psubw                m2, m0, [tmpq+ 64*0] ; dst - tmp
    mova                 m1,     [dstq+dsq*1]
    psubw                m3, m1, [tmpq+ 64*1]
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    sub                  hd, 2
    jg .w32_loop
    RET
5848
;-----------------------------------------------------------------------------
; void blend_h_16bpc(pixel *dst, ptrdiff_t dst_stride, const int16_t *tmp,
;                    int w, int h)
; OBMC blend along a horizontal edge: the blend weight depends only on the
; row and is read from obmc_masks_avx2, indexed via h. Only the first h*3/4
; rows are processed: the row counter hq runs from -(h*3/4) up towards 0.
;   dst += pmulhrsw(dst - tmp, obmc_mask[y])
; NOTE(review): C prototype inferred from the cglobal argument list.
;-----------------------------------------------------------------------------
cglobal blend_h_16bpc, 3, 7, 9, dst, ds, tmp, w, h, mask
%define base r6-$$
    lea                  r6, [$$]
    tzcnt                wd, wm             ; log2(w) indexes the jump table
    mov                  hd, hm
    movsxd               wq, [base+blend_h_avx512icl_table+wq*4]
    lea               maskq, [base+obmc_masks_avx2+hq*2] ; mask run for this h
    lea                  hd, [hq*3]
    lea                  wq, [base+blend_h_avx512icl_table+wq]
    shr                  hd, 2 ; h * 3/4
    lea               maskq, [maskq+hq*2]   ; end of run: index with negative hq
    neg                  hq                 ; hq = -(h*3/4), counts up to 0
    jmp                  wq
.w2: ; 2x2 pixels per iteration
    movd               xmm0, [dstq+dsq*0]
    pinsrd             xmm0, [dstq+dsq*1], 1
    movd               xmm2, [maskq+hq*2]   ; masks for the 2 current rows
    movq               xmm1, [tmpq]
    add                tmpq, 4*2
    punpcklwd          xmm2, xmm2           ; duplicate each row mask per pixel
    psubw              xmm1, xmm0, xmm1     ; dst - tmp
    pmulhrsw           xmm1, xmm2           ; (diff*mask + 0x4000) >> 15
    paddw              xmm0, xmm1
    movd       [dstq+dsq*0], xmm0
    pextrd     [dstq+dsq*1], xmm0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w2
    RET
.w4:
    mova               xmm3, [blend_shuf]   ; spreads 2 row masks across lanes
.w4_loop: ; 4x2 pixels per iteration
    movq               xmm0, [dstq+dsq*0]
    movhps             xmm0, [dstq+dsq*1]
    movd               xmm2, [maskq+hq*2]
    psubw              xmm1, xmm0, [tmpq]   ; dst - tmp
    add                tmpq, 8*2
    pshufb             xmm2, xmm3           ; broadcast per-row masks
    pmulhrsw           xmm1, xmm2
    paddw              xmm0, xmm1
    movq       [dstq+dsq*0], xmm0
    movhps     [dstq+dsq*1], xmm0
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w4_loop
    RET
.w8:
    vbroadcasti32x4     ym3, [blend_shuf]
    shufpd              ym3, ym3, 0x0c      ; row 0 mask -> low half, row 1 -> high
.w8_loop: ; 8x2 pixels per iteration, 2 rows in one ymm
    mova                xm0, [dstq+dsq*0]
    vinserti32x4        ym0, [dstq+dsq*1], 1
    vpbroadcastd        ym2, [maskq+hq*2]
    psubw               ym1, ym0, [tmpq]    ; dst - tmp
    add                tmpq, 16*2
    pshufb              ym2, ym3            ; per-row mask broadcast
    pmulhrsw            ym1, ym2
    paddw               ym0, ym1
    mova          [dstq+dsq*0], xm0
    vextracti32x4 [dstq+dsq*1], ym0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w8_loop
    RET
.w16:
    vbroadcasti32x4      m3, [blend_shuf]
    shufpd               m3, m3, 0xf0       ; row 0 mask -> low 256, row 1 -> high
.w16_loop: ; 16x2 pixels per iteration, 2 rows in one zmm
    mova                ym0, [dstq+dsq*0]
    vinserti32x8         m0, [dstq+dsq*1], 1
    vpbroadcastd         m2, [maskq+hq*2]
    psubw                m1, m0, [tmpq]     ; dst - tmp
    add                tmpq, 32*2
    pshufb               m2, m3             ; per-row mask broadcast
    pmulhrsw             m1, m2
    paddw                m0, m1
    mova          [dstq+dsq*0], ym0
    vextracti32x8 [dstq+dsq*1], m0, 1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w16_loop
    RET
.w32: ; 32x2 pixels per iteration; one broadcast mask word per row
    vpbroadcastw         m4, [maskq+hq*2]
    vpbroadcastw         m5, [maskq+hq*2+2]
    mova                 m0,     [dstq+dsq*0]
    psubw                m2, m0, [tmpq+ 64*0] ; dst - tmp
    mova                 m1,     [dstq+dsq*1]
    psubw                m3, m1, [tmpq+ 64*1]
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m5
    paddw                m0, m2
    paddw                m1, m3
    mova       [dstq+dsq*0], m0
    mova       [dstq+dsq*1], m1
    lea                dstq, [dstq+dsq*2]
    add                  hq, 2
    jl .w32
    RET
.w64: ; 64x1 pixels per iteration (2 zmms per row)
    vpbroadcastw         m4, [maskq+hq*2]
    mova                 m0,     [dstq+64*0]
    psubw                m2, m0, [tmpq+64*0] ; dst - tmp
    mova                 m1,     [dstq+64*1]
    psubw                m3, m1, [tmpq+64*1]
    add                tmpq, 64*2
    pmulhrsw             m2, m4
    pmulhrsw             m3, m4
    paddw                m0, m2
    paddw                m1, m3
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    add                dstq, dsq
    inc                  hq
    jl .w64
    RET
.w128: ; 128x1 pixels per iteration (4 zmms per row)
    vpbroadcastw         m8, [maskq+hq*2]
    mova                 m0,     [dstq+64*0]
    psubw                m4, m0, [tmpq+64*0] ; dst - tmp for all 4 chunks
    mova                 m1,     [dstq+64*1]
    psubw                m5, m1, [tmpq+64*1]
    mova                 m2,     [dstq+64*2]
    psubw                m6, m2, [tmpq+64*2]
    mova                 m3,     [dstq+64*3]
    psubw                m7, m3, [tmpq+64*3]
    add                tmpq, 64*4
    REPX   {pmulhrsw x, m8}, m4, m5, m6, m7
    paddw                m0, m4
    paddw                m1, m5
    paddw                m2, m6
    paddw                m3, m7
    mova        [dstq+64*0], m0
    mova        [dstq+64*1], m1
    mova        [dstq+64*2], m2
    mova        [dstq+64*3], m3
    add                dstq, dsq
    inc                  hq
    jl .w128
    RET
5990
;-----------------------------------------------------------------------------
; void resize_16bpc(pixel *dst, ptrdiff_t dst_stride,
;                   const pixel *src, ptrdiff_t src_stride,
;                   int dst_w, int h, int src_w, int dx, int mx0, int pxmax)
; Horizontal scaling: for each output pixel an 8-tap filter is selected from
; resize_filter by the 6-bit fractional source position; source positions
; advance by dx (14-bit fixed point) per output pixel. 16 output pixels are
; produced per inner iteration via vpgatherdq/vpgatherdd; a pshufb-based
; edge-emulation path handles filter windows that cross the picture borders.
; Results are rounded, packed and clamped to pxmax.
; NOTE(review): prototype and fixed-point widths inferred from the code
; below (psrad ..,14 / pd_63 / resize_filter+m9*8) — confirm vs. the C side.
;-----------------------------------------------------------------------------
cglobal resize_16bpc, 6, 12, 32, dst, dst_stride, src, src_stride, \
                                 dst_w, h, src_w, dx, mx0, pxmax
    sub          dword mx0m, 4<<14          ; shift start left by 4 px (taps 0-7
                                            ; are centered on the position)
    sub        dword src_wm, 8              ; clip limit for the 8-tap window
    mov                  r6, ~0
    vpbroadcastd         m5, dxm
    vpbroadcastd         m8, mx0m
    vpbroadcastd         m6, src_wm
    kmovq                k6, r6             ; k6 = all-ones gather-mask template
 DEFINE_ARGS dst, dst_stride, src, src_stride, dst_w, h, x, _, _, pxmax
    LEA                  r7, $$
%define base r7-$$
    vpbroadcastd         m3, [base+pd_16384]
    vpbroadcastd         m7, [base+pd_63]   ; mask for the 6-bit filter index
    mova                m24, [base+resize_permA]
    mova                m25, [base+resize_permB]
    mova                m26, [base+resize_permC]
    mova                m27, [base+resize_permD]
    vbroadcasti32x4     m28, [base+resize_shufA]
    vbroadcasti32x4     m29, [base+resize_shufB]
    mova                m30, [base+resize_permE]
    vpbroadcastw       ym31, pxmaxm          ; clamp limit for output pixels
    vpdpwssd             m8, m5, [base+rescale_mul] ; mx+dx*[0-15]
    pslld                m5, 4                      ; dx*16
    pslld                m6, 14                     ; (src_w-8)<<14
    pxor                 m2, m2                     ; zero / dpwssd accumulator init
.loop_y:
    xor                  xd, xd
    mova                 m4, m8     ; per-line working version of mx
.loop_x:
    pmaxsd               m0, m4, m2
    psrad                m9, m4, 8  ; filter offset (unmasked)
    pminsd               m0, m6     ; iclip(mx, 0, src_w-8)
    psubd                m1, m4, m0 ; pshufb offset
    psrad                m0, 14     ; clipped src_x offset
    psrad                m1, 14     ; pshufb edge_emu offset
    vptestmd             k5, m1, m1 ; any lane needing edge emulation?
    pand                 m9, m7     ; filter offset (masked)
    ktestw               k5, k5
    jz .load                        ; fast path: whole window inside the image
    ; -- edge-emulation path: gather 4 qwords (8 px) per output pixel, then
    ;    pshufb with per-lane shuffles from resize_shuf to replicate borders --
    vpbroadcastq        m14, [base+pd_0_4]
    vpermq              m10, m0, q1100
    vpermq              m11, m0, q3322
    vpermq              m20, m1, q1100
    vpermq              m21, m1, q3322
    punpckldq           m10, m10
    punpckldq           m11, m11
    punpckldq           m20, m20
    punpckldq           m21, m21
    paddd               m10, m14
    paddd               m11, m14
    paddd               m20, m14
    paddd               m21, m14
    vextracti32x8      ym12, m10, 1
    vextracti32x8      ym13, m11, 1
    vextracti32x8      ym22, m20, 1
    vextracti32x8      ym23, m21, 1
    kmovq                k1, k6     ; gathers consume the mask; reload each time
    kmovq                k2, k6
    kmovq                k3, k6
    kmovq                k4, k6
    vpgatherdq      m16{k1}, [srcq+ym10*2] ; 0 1 2 3
    vpgatherdq      m17{k2}, [srcq+ym11*2] ; 4 5 6 7
    vpgatherdq      m18{k3}, [srcq+ym12*2] ; 8 9 A B
    vpgatherdq      m19{k4}, [srcq+ym13*2] ; C D E F
    kmovq                k1, k6
    kmovq                k2, k6
    kmovq                k3, k6
    kmovq                k4, k6
    vpgatherdq       m0{k1}, [base+resize_shuf+8+ym20*2]
    vpgatherdq       m1{k2}, [base+resize_shuf+8+ym21*2]
    vpgatherdq      m14{k3}, [base+resize_shuf+8+ym22*2]
    vpgatherdq      m15{k4}, [base+resize_shuf+8+ym23*2]
    pshufb              m16, m0    ; apply border replication per lane
    pshufb              m17, m1
    pshufb              m18, m14
    pshufb              m19, m15
    ; transpose gathered rows into tap-major order for vpdpwssd
    mova                m20, m24
    mova                m22, m24
    mova                m21, m25
    mova                m23, m25
    vpermi2d            m20, m16, m17 ; 0-3a 0-3b 4-7a 4-7b
    vpermi2d            m21, m16, m17 ; 0-3c 0-3d 4-7c 4-7d
    vpermi2d            m22, m18, m19 ; 8-Ba 8-Bb C-Fa C-Fb
    vpermi2d            m23, m18, m19 ; 8-Bc 8-Bd C-Fc C-Fd
    mova                m15, m26
    mova                m17, m26
    mova                m16, m27
    mova                m18, m27
    vpermi2q            m15, m20, m22 ; 0-3a 4-7a 8-Ba C-Fa
    vpermi2q            m16, m20, m22 ; 0-3b 4-7b 8-Bb C-Fb
    vpermi2q            m17, m21, m23 ; 0-3c 4-7c 8-Bc C-Fc
    vpermi2q            m18, m21, m23 ; 0-3d 4-7d 8-Bd C-Fd
    kmovq                k1, k6
    kmovq                k2, k6
    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0] ; taps 0-3 per pixel
    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4] ; taps 4-7 per pixel
    pshufb              m10, m11, m28
    pshufb              m11, m11, m29
    pshufb              m12, m13, m28
    pshufb              m13, m13, m29
    jmp .filter
.load: ; fast path: plain dword gathers, no border handling needed
    kmovq                k1, k6
    kmovq                k2, k6
    kmovq                k3, k6
    kmovq                k4, k6
    vpgatherdd      m11{k1}, [base+resize_filter+m9*8+0] ; taps 0-3 per pixel
    vpgatherdd      m13{k2}, [base+resize_filter+m9*8+4] ; taps 4-7 per pixel
    pshufb              m10, m11, m28
    pshufb              m11, m11, m29
    pshufb              m12, m13, m28
    pshufb              m13, m13, m29
    vpgatherdd      m15{k3}, [srcq+m0*2+ 0]
    vpgatherdd      m16{k4}, [srcq+m0*2+ 4]
    kmovq                k1, k6
    kmovq                k2, k6
    vpgatherdd      m17{k1}, [srcq+m0*2+ 8]
    vpgatherdd      m18{k2}, [srcq+m0*2+12]
.filter: ; 8-tap dot product per output pixel, accumulated in dwords
    mova                m14, m2
    vpdpwssd            m14, m15, m10
    vpdpwssd            m14, m16, m11
    vpdpwssd            m14, m17, m12
    vpdpwssd            m14, m18, m13
    psubd               m14, m3, m14 ; (16384 - acc); NOTE(review): coefficients
                                     ; appear stored negated — confirm vs table
    psrad               m14, 15      ; round/scale to pixel range
    packusdw            m14, m14
    vpermq              m14, m30, m14
    pminsw             ym14, ym31    ; clamp to pxmax
    mova        [dstq+xq*2], ym14
    paddd                m4, m5      ; mx += dx*16
    add                  xd, 16
    cmp                  xd, dst_wd
    jl .loop_x
    add                dstq, dst_strideq
    add                srcq, src_strideq
    dec                  hd
    jg .loop_y
    RET
6131
6132%endif ; ARCH_X86_64
6133