; Copyright © 2018, VideoLAN and dav1d authors
; Copyright © 2018, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 64

pb_4x0_4x4_4x8_4x12: times 4 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12

pb_mask: dd 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080
         dd 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, 0x8000

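; gather/scatter index tables for the horizontal filters: scaled by the
; pixel/level strides at run time (pmulld) to form per-lane byte offsets;
; hshuf4 re-packs gathered dwords into pixel order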
hmulA: dd  0,  8, 16, 24, 32, 40, 48, 56,  4, 12, 20, 28, 36, 44, 52, 60
hmulB: dd  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
hmulC: dd  0,  1,  2,  3, 16, 17, 18, 19, 32, 33, 34, 35, 48, 49, 50, 51
hmulD: dd  0,  1, 16, 17, 32, 33, 48, 49
hshuf4:db  0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15

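; bit-matrices for gf2p8affineqb: x86 has no per-byte shift instruction, so
; these emulate logical right shifts by 1, 3 and 4 of each byte via a
; GF(2)-affine transform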
shift1: dq 0x0204081020408000
shift3: dq 0x0810204080000000
shift4: dq 0x1020408000000000

pb_1:    times 4 db 1
pb_2:    times 4 db 2
pb_3:    times 4 db 3
pb_4:    times 4 db 4
pb_16:   times 4 db 16
pb_63:   times 4 db 63
pb_64:   times 4 db 64
pb_128:  times 4 db 0x80
pb_2_1:  times 2 db 2, 1
pb_3_1:  times 2 db 3, 1
pb_7_1:  times 2 db 7, 1
pb_m1_0: times 2 db -1, 0
pb_m1_1: times 2 db -1, 1
pb_m1_2: times 2 db -1, 2
pw_2048: times 2 dw 2048
pw_4096: times 2 dw 4096

SECTION .text

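; dst = |a - b| per byte: psubusb clamps negative differences to zero, so
; exactly one of (a-b) and (b-a) survives, and por merges the two halves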
%macro ABSSUB 4 ; dst, a, b, tmp
    psubusb           %1, %2, %3
    psubusb           %4, %3, %2
    por               %1, %4
%endmacro

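; transposes a 16x4 block held in four registers and scatters it back into
; dst as 4-byte columns; m19 holds precomputed row offsets, and the all-ones
; mask in k6 is reloaded into k1 before every vpscatterdd because scatter
; instructions consume (zero) their write mask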
%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
    punpcklbw        m%5, m%1, m%2
    punpckhbw        m%1, m%2
    punpcklbw        m%2, m%3, m%4
    punpckhbw        m%3, m%4
    punpcklwd        m%4, m%5, m%2
    punpckhwd        m%5, m%2
    punpcklwd        m%2, m%1, m%3
    punpckhwd        m%1, m%3
    kmovw             k1, k6
    lea               t0, [dstq+strideq*4]
    vpscatterdd [dstq+m19-2]{k1}, m%4
    kmovw             k1, k6
    lea               t1, [dstq+strideq*8]
    vpscatterdd [t0  +m19-2]{k1}, m%5
    kmovw             k1, k6
    lea               t2, [t0  +strideq*8]
    vpscatterdd [t1  +m19-2]{k1}, m%2
    kmovw             k1, k6
    vpscatterdd [t2  +m19-2]{k1}, m%1
%endmacro

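; full 16x16-byte transpose over the fixed register set used by the wd=16
; horizontal filter; register pressure forces one row through memory: %1
; selects whether input row 15 is loaded from %3 instead of a register, and
; %2 whether output row 0 is spilled to %3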
%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
%if %1 == 0
    SWAP             m16, m22
%endif
    punpcklbw        m22, m24, m26
    punpckhbw        m24, m26
    punpcklbw        m26, m2, m3
    punpckhbw         m2, m3
    punpcklbw         m3, m4, m5
    punpckhbw         m4, m5
    punpcklbw         m5, m6, m7
    punpckhbw         m6, m7
    punpcklbw         m7, m8, m9
    punpckhbw         m8, m9
    punpcklbw         m9, m10, m11
    punpckhbw        m10, m11
    punpcklbw        m11, m25, m13
    punpckhbw        m25, m13
%if %1 == 0
    SWAP             m13, m16
%else
    mova             m13, %3
%endif
    SWAP             m16, m25
    punpcklbw        m25, m14, m13
    punpckhbw        m13, m14, m13
    ; interleaved in m22,24,26,2,3,4,5,6,7,8,9,10,11,rsp%3,25,13
    punpcklwd        m14, m22, m26
    punpckhwd        m22, m26
    punpcklwd        m26, m24, m2
    punpckhwd        m24, m2
    punpcklwd         m2, m3, m5
    punpckhwd         m3, m5
    punpcklwd         m5, m4, m6
    punpckhwd         m4, m6
    punpcklwd         m6, m7, m9
    punpckhwd         m7, m9
    punpcklwd         m9, m8, m10
    punpckhwd         m8, m10
    punpcklwd        m10, m11, m25
    punpckhwd        m11, m25
    SWAP             m25, m16, m11
    punpcklwd        m11, m25, m13
    punpckhwd        m25, m13
    ; interleaved in m14,15,26,24,2,3,5,4,6,7,9,8,10,rsp%3,11,25
    punpckldq        m13, m14, m2
    punpckhdq        m14, m2
    punpckldq         m2, m22, m3
    punpckhdq        m22, m3
    punpckldq         m3, m26, m5
    punpckhdq        m26, m5
    punpckldq         m5, m24, m4
    punpckhdq        m24, m4
    punpckldq         m4, m6, m10
    punpckhdq         m6, m10
    punpckldq        m10, m9, m11
    punpckhdq         m9, m11
    punpckldq        m11, m8, m25
    punpckhdq         m8, m25
    SWAP             m25, m16, m8
    punpckldq         m8, m7, m25
    punpckhdq         m7, m25
    ; interleaved in m13,14,2,15,3,26,5,24,4,6,8,7,10,9,11,rsp%3
    punpcklqdq       m25, m13, m4
    punpckhqdq       m13, m4
    punpcklqdq        m4, m14, m6
    punpckhqdq       m14, m6
    punpcklqdq        m6, m2, m8
    punpckhqdq        m2, m8
    punpcklqdq        m8, m22, m7
    punpckhqdq       m22, m7
    punpcklqdq        m7, m3, m10
    punpckhqdq        m3, m10
    punpcklqdq       m10, m26, m9
    punpckhqdq       m26, m9
    punpcklqdq        m9, m5, m11
    punpckhqdq        m5, m11
    SWAP             m11, m16
%if %2 == 0
    SWAP             m16, m25
%else
    mova              %3, m25
%endif
    punpcklqdq       m25, m24, m11
    punpckhqdq       m24, m11
%if %2 == 0
    SWAP             m11, m16
%endif
    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,26,9,5,25,24
    SWAP              24, 11, 26, 13, 5, 2, 4, 6, 8, 7, 22
    SWAP               3, 14, 25, 9
%endmacro

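; performs one loop-filter pass for the given filter width and direction:
; v loads and stores whole rows in place, h gathers columns, filters them,
; and transposes the result back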
%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
    ; load data
%ifidn %2, v
%define is_h 0
%if %1 == 4
    lea               t0, [dstq+mstrideq*2]
    mova              m3, [t0  +strideq*0]    ; p1
    mova              m4, [t0  +strideq*1]    ; p0
    mova              m5, [t0  +strideq*2]    ; q0
    mova              m6, [t0  +stride3q ]    ; q1
%else
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
%if %1 == 16
    lea               t0, [dstq+mstrideq*8]
    mova             m16, [t0  +strideq*1]
    mova             m17, [t0  +strideq*2]
    mova             m18, [t0  +stride3q ]
%endif
    lea               t0, [dstq+mstrideq*4]
%if %1 != 6
    mova             m25, [t0  +strideq*0]
%endif
    mova             m13, [t0  +strideq*1]
    mova              m3, [t0  +strideq*2]
    mova              m4, [t0  +stride3q ]
    mova              m5, [dstq+strideq*0]
    mova              m6, [dstq+strideq*1]
    mova             m14, [dstq+strideq*2]
%if %1 != 6
    mova             m22, [dstq+stride3q ]
%endif
%if %1 == 16
    lea               t0, [dstq+strideq*4]
    mova             m29, [t0  +strideq*0]
    mova             m30, [t0  +strideq*1]
    mova             m31, [t0  +strideq*2]
%endif
%endif
%else ; h
%define is_h 1
    ; load lines
%if %1 == 4
    vbroadcasti32x4   m0, [hshuf4]
    kmovw             k1, k6
    lea               t0, [dstq+strideq*4]
    vpgatherdd    m3{k1}, [dstq+m19-2]
    kmovw             k1, k6
    lea               t1, [dstq+strideq*8]
    vpgatherdd    m4{k1}, [t0  +m19-2]
    kmovw             k1, k6
    lea               t2, [t0  +strideq*8]
    vpgatherdd    m5{k1}, [t1  +m19-2]
    kmovw             k1, k6
    vpgatherdd    m6{k1}, [t2  +m19-2]
    pshufb            m3, m0
    pshufb            m4, m0
    pshufb            m5, m0
    pshufb            m6, m0
    punpckldq         m7, m3, m4
    punpckhdq         m3, m4
    punpckldq         m4, m5, m6
    punpckhdq         m5, m6
    punpcklqdq        m6, m7, m4
    punpckhqdq        m7, m4
    punpcklqdq        m4, m3, m5
    punpckhqdq        m3, m5
    SWAP               3, 6
    SWAP               5, 4, 7
    ; 6,7,4,3 -> 3,4,5,6
%elif %1 == 6 || %1 == 8
    kmovb             k1, k7
    lea               t0, [dstq+strideq*1]
    vpgatherdq    m3{k1}, [dstq+ym21-%1/2]
    kmovb             k1, k7
    lea               t1, [dstq+strideq*2]
    vpgatherdq    m4{k1}, [t0  +ym21-%1/2]
    kmovb             k1, k7
    lea               t2, [dstq+stride3q ]
    vpgatherdq    m5{k1}, [t1  +ym21-%1/2]
    kmovb             k1, k7
    vextracti32x8    ym0, m21, 1
    vpgatherdq    m6{k1}, [t2  +ym21-%1/2]
    kmovb             k1, k7
    vpgatherdq   m12{k1}, [dstq+ym0 -%1/2]
    kmovb             k1, k7
    vpgatherdq   m13{k1}, [t0  +ym0 -%1/2]
    kmovb             k1, k7
    vpgatherdq   m14{k1}, [t1  +ym0 -%1/2]
    kmovb             k1, k7
    vpgatherdq   m15{k1}, [t2  +ym0 -%1/2]
    ; transpose 8x16
    ; xm3: A-H0,A-H8
    ; xm4: A-H1,A-H9
    ; xm5: A-H2,A-H10
    ; xm6: A-H3,A-H11
    ; xm12: A-H4,A-H12
    ; xm13: A-H5,A-H13
    ; xm14: A-H6,A-H14
    ; xm15: A-H7,A-H15
    punpcklbw         m7, m3, m4
    punpckhbw         m3, m4
    punpcklbw         m4, m5, m6
    punpckhbw         m5, m6
    punpcklbw         m6, m12, m13
    punpckhbw        m12, m13
    punpcklbw        m13, m14, m15
    punpckhbw        m14, m15
    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
    punpcklwd        m15, m7, m4
    punpckhwd         m7, m4
    punpcklwd         m4, m3, m5
    punpckhwd         m3, m5
    punpcklwd         m5, m6, m13
    punpckhwd         m6, m13
    punpcklwd        m13, m12, m14
    punpckhwd        m12, m14
    ; xm15: A0-3,B0-3,C0-3,D0-3
    ; xm7: E0-3,F0-3,G0-3,H0-3
    ; xm4: A8-11,B8-11,C8-11,D8-11
    ; xm3: E8-11,F8-11,G8-11,H8-11
    ; xm5: A4-7,B4-7,C4-7,D4-7
    ; xm6: E4-7,F4-7,G4-7,H4-7
    ; xm13: A12-15,B12-15,C12-15,D12-15
    ; xm12: E12-15,F12-15,G12-15,H12-15
    punpckldq        m14, m15, m5
    punpckhdq        m15, m5
    punpckldq         m5, m7, m6
 %if %1 != 6
    punpckhdq         m7, m6
 %endif
    punpckldq         m6, m4, m13
    punpckhdq         m4, m13
    punpckldq        m13, m3, m12
 %if %1 != 6
    punpckhdq        m12, m3, m12
 %endif
    ; xm14: A0-7,B0-7
    ; xm15: C0-7,D0-7
    ; xm5: E0-7,F0-7
    ; xm7: G0-7,H0-7
    ; xm6: A8-15,B8-15
    ; xm4: C8-15,D8-15
    ; xm13: E8-15,F8-15
    ; xm12: G8-15,H8-15
    punpcklqdq        m3, m14, m6
    punpckhqdq       m14, m6
    punpckhqdq        m6, m15, m4
    punpcklqdq       m15, m4
    punpcklqdq        m4, m5, m13
    punpckhqdq       m13, m5, m13
 %if %1 == 8
    punpcklqdq        m5, m7, m12
    punpckhqdq       m25, m7, m12
    ; xm3: A0-15
    ; xm14: B0-15
    ; xm15: C0-15
    ; xm6: D0-15
    ; xm4: E0-15
    ; xm13: F0-15
    ; xm5: G0-15
    ; xm25: H0-15
    SWAP              25, 3, 15
    SWAP              13, 14, 5, 4, 6
    SWAP              15, 22
    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,22
 %else
    SWAP              13, 3, 14
    SWAP               6, 4, 15, 5
    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
 %endif
%else ; 16, h
    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
    ; remainder at the end for the second transpose
    movu            xm24, [dstq+strideq*0-8]
    movu            xm26, [dstq+strideq*1-8]
    movu             xm2, [dstq+strideq*2-8]
    movu             xm3, [dstq+stride3q -8]
    lea               t0, [dstq+strideq*4]
    movu             xm4, [t0  +strideq*0-8]
    movu             xm5, [t0  +strideq*1-8]
    movu             xm6, [t0  +strideq*2-8]
    movu             xm7, [t0  +stride3q -8]
    lea               t0, [t0  +strideq*4]
    movu             xm8, [t0  +strideq*0-8]
    movu             xm9, [t0  +strideq*1-8]
    movu            xm10, [t0  +strideq*2-8]
    movu            xm11, [t0  +stride3q -8]
    lea               t0, [t0  +strideq*4]
    movu            xm25, [t0  +strideq*0-8]
    movu            xm13, [t0  +strideq*1-8]
    movu            xm14, [t0  +strideq*2-8]
    movu            xm22, [t0  +stride3q -8]
    lea               t0, [t0  +strideq*4]
    vinserti32x4    ym24, [t0  +strideq*0-8], 1
    vinserti32x4    ym26, [t0  +strideq*1-8], 1
    vinserti32x4     ym2, [t0  +strideq*2-8], 1
    vinserti32x4     ym3, [t0  +stride3q -8], 1
    lea               t0, [t0  +strideq*4]
    vinserti32x4     ym4, [t0  +strideq*0-8], 1
    vinserti32x4     ym5, [t0  +strideq*1-8], 1
    vinserti32x4     ym6, [t0  +strideq*2-8], 1
    vinserti32x4     ym7, [t0  +stride3q -8], 1
    lea               t0, [t0  +strideq*4]
    vinserti32x4     ym8, [t0  +strideq*0-8], 1
    vinserti32x4     ym9, [t0  +strideq*1-8], 1
    vinserti32x4    ym10, [t0  +strideq*2-8], 1
    vinserti32x4    ym11, [t0  +stride3q -8], 1
    lea               t0, [t0  +strideq*4]
    vinserti32x4    ym25, [t0  +strideq*0-8], 1
    vinserti32x4    ym13, [t0  +strideq*1-8], 1
    vinserti32x4    ym14, [t0  +strideq*2-8], 1
    vinserti32x4    ym22, [t0  +stride3q -8], 1
    lea               t0, [t0  +strideq*4]
    vinserti32x4     m24, [t0  +strideq*0-8], 2
    vinserti32x4     m26, [t0  +strideq*1-8], 2
    vinserti32x4      m2, [t0  +strideq*2-8], 2
    vinserti32x4      m3, [t0  +stride3q -8], 2
    lea               t0, [t0  +strideq*4]
    vinserti32x4      m4, [t0  +strideq*0-8], 2
    vinserti32x4      m5, [t0  +strideq*1-8], 2
    vinserti32x4      m6, [t0  +strideq*2-8], 2
    vinserti32x4      m7, [t0  +stride3q -8], 2
    lea               t0, [t0  +strideq*4]
    vinserti32x4      m8, [t0  +strideq*0-8], 2
    vinserti32x4      m9, [t0  +strideq*1-8], 2
    vinserti32x4     m10, [t0  +strideq*2-8], 2
    vinserti32x4     m11, [t0  +stride3q -8], 2
    lea               t0, [t0  +strideq*4]
    vinserti32x4     m25, [t0  +strideq*0-8], 2
    vinserti32x4     m13, [t0  +strideq*1-8], 2
    vinserti32x4     m14, [t0  +strideq*2-8], 2
    vinserti32x4     m22, [t0  +stride3q -8], 2
    lea               t0, [t0  +strideq*4]
    vinserti32x4     m24, [t0  +strideq*0-8], 3
    vinserti32x4     m26, [t0  +strideq*1-8], 3
    vinserti32x4      m2, [t0  +strideq*2-8], 3
    vinserti32x4      m3, [t0  +stride3q -8], 3
    lea               t0, [t0  +strideq*4]
    vinserti32x4      m4, [t0  +strideq*0-8], 3
    vinserti32x4      m5, [t0  +strideq*1-8], 3
    vinserti32x4      m6, [t0  +strideq*2-8], 3
    vinserti32x4      m7, [t0  +stride3q -8], 3
    lea               t0, [t0  +strideq*4]
    vinserti32x4      m8, [t0  +strideq*0-8], 3
    vinserti32x4      m9, [t0  +strideq*1-8], 3
    vinserti32x4     m10, [t0  +strideq*2-8], 3
    vinserti32x4     m11, [t0  +stride3q -8], 3
    lea               t0, [t0  +strideq*4]
    vinserti32x4     m25, [t0  +strideq*0-8], 3
    vinserti32x4     m13, [t0  +strideq*1-8], 3
    vinserti32x4     m14, [t0  +strideq*2-8], 3
    vinserti32x4     m22, [t0  +stride3q -8], 3
    ;
    TRANSPOSE_16X16B 0, 1, [rsp+0*64]
    SWAP             m16, m26
    SWAP             m17, m2
    SWAP             m18, m3
    SWAP             m29, m25
    SWAP             m30, m13
    SWAP             m31, m14
    mova      [rsp+4*64], m22
    ; 4,5,6,7,8,9,10,11 -> 25,13,3,4,5,6,14,22
    SWAP              25, 4, 7
    SWAP              13, 5, 8
    SWAP               3, 6, 9
    SWAP              10, 14
    SWAP              11, 22
%endif
%endif

    ; load L/E/I/H
    vpbroadcastd     m15, [pb_1]
%ifidn %2, v
    movu              m1, [lq]
    movu              m0, [lq+l_strideq]
%else
    kmovw             k1, k6
    vpgatherdd    m0{k1}, [lq+m20+4]
    kmovw             k1, k6
    vpgatherdd    m1{k1}, [lq+m20+0]
%endif
    pxor              m2, m2
    pcmpeqb           k1, m0, m2
    vmovdqu8      m0{k1}, m1                ; l[x][] ? l[x][] : l[x-stride][]
    pshufb            m0, pbshuf            ; l[x][0]
    vpcmpub           k3, m0, m2, 4 ; neq   ; L
    psrlq             m2, m0, [lutq+128]
    pand              m2, [pb_63]{bcstd}
    vpbroadcastb      m1, [lutq+136]
    pminub            m2, m1
    pmaxub            m2, m15               ; I
    gf2p8affineqb     m1, m0, [shift4]{bcstq}, 0 ; H
    paddd             m0, [pb_2]{bcstd}
    paddb             m0, m0
    paddb             m0, m2                ; E
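    ; per byte at this point: m2 = I = clip((L >> lut[128]) & 63, 1, lut[136]),
    ; m1 = H = L >> 4 (via the shift4 bit-matrix), m0 = E = 2 * (L + 2) + I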

    ABSSUB            m8, m3, m4, m9        ; abs(p1-p0)
    ABSSUB            m9, m5, m6, m10       ; abs(q1-q0)
    pmaxub            m8, m9
    vpcmpub           k1, m8, m1, 6 ; gt    ; hev
%if %1 != 4
 %if %1 == 6
    ABSSUB            m9, m13, m4, m10      ; abs(p2-p0)
    pmaxub            m9, m8
 %else
    ABSSUB            m9, m25, m4, m10      ; abs(p3-p0)
    pmaxub            m9, m8
    ABSSUB           m10, m13, m4, m11      ; abs(p2-p0)
    pmaxub            m9, m10
 %endif
    ABSSUB           m10, m5,  m14, m11     ; abs(q2-q0)
    pmaxub            m9, m10
 %if %1 != 6
    ABSSUB           m10, m5,  m22, m11     ; abs(q3-q0)
    pmaxub            m9, m10
 %endif
    vpcmpub       k2{k3}, m9, m15, 2 ; le   ; flat8in
 %if %1 == 6
    ABSSUB           m10, m13, m3,  m1      ; abs(p2-p1)
 %else
    ABSSUB           m10, m25, m13, m11     ; abs(p3-p2)
    ABSSUB           m11, m13, m3,  m1      ; abs(p2-p1)
    pmaxub           m10, m11
    ABSSUB           m11, m14, m22, m1      ; abs(q3-q2)
    pmaxub           m10, m11
 %endif
    ABSSUB           m11, m14, m6,  m1      ; abs(q2-q1)
    pmaxub           m10, m11
 %if %1 == 16
    vpbroadcastd     m11, [maskq+8]
    por              m11, [maskq+4]{bcstd}
 %else
    vpbroadcastd     m11, [maskq+4]
 %endif
    vptestmd          k4, m11, pbmask
    vmovdqa32 m10{k4}{z}, m10               ; only apply fm-wide to wd>4 blocks
    pmaxub            m8, m10
%endif
    vpcmpub       k3{k3}, m8, m2, 2 ; le
    ABSSUB           m10, m3, m6, m11       ; abs(p1-q1)
    ABSSUB           m11, m4, m5, m2        ; abs(p0-q0)
    paddusb          m11, m11
    gf2p8affineqb    m10, m10, [shift1]{bcstq}, 0
    paddusb          m10, m11               ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    vpcmpub       k3{k3}, m10, m0, 2        ; abs(p0-q0)*2+(abs(p1-q1)>>1) <= E

%if %1 == 16
    ABSSUB            m1, m16, m4, m2
    ABSSUB            m2, m17, m4, m10
    pmaxub            m1, m2
    ABSSUB            m2, m18, m4, m10
    pmaxub            m1, m2
    ABSSUB            m2, m29, m5, m10
    pmaxub            m1, m2
    ABSSUB            m2, m30, m5, m10
    pmaxub            m1, m2
    ABSSUB            m2, m31, m5, m10
    pmaxub            m1, m2
    kandq             k2, k2, k3
    vpcmpub       k4{k2}, m1, m15, 2        ; flat8in & flat8out
    vpbroadcastd      m2, [maskq+8]
    vptestmd          k5, m2, pbmask
    vpmovm2d          m7, k5
    vptestmb      k4{k4}, m7, m7            ; flat16 & fm
    por              m10, m2, [maskq+4]{bcstd}
    vptestmd          k5, m10, pbmask
    vpmovm2d          m7, k5
    vptestmb      k2{k2}, m7, m7            ; flat8in
    por               m2, m10, [maskq+0]{bcstd}
    vptestmd          k5, m2, pbmask
    vpmovm2d          m7, k5
    vptestmb      k3{k3}, m7, m7
    kandnq            k3, k2, k3            ; fm & !flat8 & !flat16
    kandnq            k2, k4, k2            ; flat8 & !flat16
%elif %1 != 4
    vpbroadcastd      m0, [maskq+4]
    vptestmd          k4, m0, pbmask
    vpmovm2d          m7, k4
    vptestmb      k2{k2}, m7, m7
    kandq             k2, k2, k3            ; flat8 & fm
    por               m0, [maskq+0]{bcstd}
    vptestmd          k4, m0, pbmask
    vpmovm2d          m7, k4
    vptestmb      k3{k3}, m7, m7
    kandnq            k3, k2, k3            ; fm & !flat8
%else
 %ifidn %2, v
    vptestmd          k4, pbmask, [maskq+0]{bcstd}
 %else
    vpbroadcastd      m0, [maskq+0]
    vptestmd          k4, m0, pbmask
 %endif
    vpmovm2d          m7, k4
    vptestmb      k3{k3}, m7, m7            ; fm
%endif

    ; short filter
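    ; pixels are XORed with 0x80 below so the unsigned 0..255 range maps onto
    ; signed -128..127, letting psubsb/paddsb implement the spec's clipped
    ; differences (iclip_diff)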
%if %1 >= 8
    SWAP             m23, m15
%endif
    vpbroadcastd     m15, [pb_3]
    vpbroadcastd      m0, [pb_4]
    vpbroadcastd     m12, [pb_16]
    vpbroadcastd      m1, [pb_64]
    pxor              m3, pb128
    pxor              m6, pb128
    psubsb    m10{k1}{z}, m3, m6            ; f=iclip_diff(p1-q1)&hev
    pxor              m4, pb128
    pxor              m5, pb128
    psubsb           m11, m5, m4
    paddsb           m10, m11
    paddsb           m10, m11
    paddsb    m10{k3}{z}, m10, m11          ; f=iclip_diff(3*(q0-p0)+f)&fm
    paddsb            m8, m10, m15
    paddsb           m10, m0
    gf2p8affineqb     m8, m8, [shift3]{bcstq}, 16
    gf2p8affineqb    m10, m10, [shift3]{bcstq}, 16
    psubb             m8, m12               ; f2
    psubb            m10, m12               ; f1
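    ; the shift3 matrix gives a logical per-byte >>3; its imm8 of 16 XORs in
    ; bit 4 and the psubb of pb_16 subtracts it again, which together
    ; sign-extend the 5-bit result, i.e. an arithmetic >>3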
    paddsb            m4, m8
    psubsb            m5, m10
    pxor              m4, pb128
    pxor              m5, pb128
    ;
    pxor             m10, pb128
    pxor              m8, m8
    pavgb             m8, m10               ; f=(f1+1)>>1
    psubb             m8, m1
    knotq             k1, k1
    paddsb        m3{k1}, m3, m8
    psubsb        m6{k1}, m6, m8
    pxor              m3, pb128
    pxor              m6, pb128

%if %1 == 16
    ; flat16 filter
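    ; the wide filter is evaluated as a sliding sum: step A builds the first
    ; weighted sum with pmaddubsw, and each later step updates it with one
    ; (-old,+new) tap pair (pb_m1_1) before the rounding >>4 done by
    ; pmulhrsw against pw_2048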
%ifidn %2, v
    lea               t0, [dstq+mstrideq*8]
%endif
    SWAP             m24, m16, m14
    SWAP              m2, m17, m22
    SWAP              m7, m18

    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
    ; write -6
    vpbroadcastd      m1, [pb_7_1]
    vpbroadcastd     m12, [pb_2]
    punpcklbw        m14, m24, m25
    punpckhbw        m22, m24, m25
    pmaddubsw        m10, m14, m1
    pmaddubsw        m11, m22, m1          ; p6*7+p3
    punpcklbw         m8, m2, m7
    punpckhbw         m9, m2, m7
    pmaddubsw         m8, m12
    pmaddubsw         m9, m12
    paddw            m10, m8
    paddw            m11, m9                ; p6*7+p5*2+p4*2+p3
%ifidn %2, h
    vpbroadcastd     m27, [pw_2048]
    vpbroadcastd      m1, [pb_m1_1]
 %define pw2048 m27
 %define pbm1_1 m1
%endif
    punpcklbw         m8, m13, m3
    punpckhbw         m9, m13, m3
    pmaddubsw         m8, m23
    pmaddubsw         m9, m23
    paddw            m10, m8
    paddw            m11, m9                ; p6*7+p5*2+p4*2+p3+p2+p1
    punpcklbw         m8, m4, m5
    punpckhbw         m9, m4, m5
    pmaddubsw         m8, m23
    pmaddubsw         m9, m23
    paddw            m10, m8
    paddw            m11, m9                ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    pmulhrsw          m8, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m8, m9
%ifidn %2, v
    vmovdqu8 [t0+strideq*2]{k4}, m8         ; p5
%else
    vpblendmb     m8{k4}, m2, m8
    mova      [rsp+1*64], m8
%endif

    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
    ; write -5
    pmaddubsw        m14, pbm1_1
    pmaddubsw        m22, pbm1_1
    paddw            m10, m14
    paddw            m11, m22               ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
    punpcklbw         m8, m24, m6
    punpckhbw         m9, m24, m6
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
    SWAP             m18, m8
    SWAP             m23, m9
    pmulhrsw          m8, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m8, m9
%ifidn %2, v
    vmovdqu8 [t0+stride3q]{k4}, m8          ; p4
%else
    vpblendmb     m8{k4}, m7, m8
    mova      [rsp+2*64], m8
%endif

    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
    ; write -4
    SWAP             m14, m16
    punpcklbw         m8, m24, m13
    punpckhbw         m9, m24, m13
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
    punpcklbw         m8, m2, m14
    punpckhbw         m2, m14
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m2, pbm1_1
    paddw            m10, m8
    paddw            m11, m2                ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
    SWAP             m16, m8
    pmulhrsw          m8, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m8, m9
%ifidn %2, v
    vmovdqu8 [t0+strideq*4]{k4}, m8         ; p3
%else
    vpblendmb     m8{k4}, m25, m8
    mova      [rsp+3*64], m8
%endif

    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
    ; write -3
    SWAP             m22, m17
    punpcklbw         m8, m24, m3
    punpckhbw         m9, m24, m3
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
    punpcklbw         m8, m7, m22
    punpckhbw         m7, m22
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m7, pbm1_1
    paddw            m10, m8
    paddw            m11, m7                ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
    SWAP             m17, m8
    pmulhrsw          m8, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m8, m9
    vpblendmb    m15{k4}, m13, m8           ; don't clobber p2/m13 since we need it in F

    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
    ; write -2
%ifidn %2, v
    lea               t0, [dstq+strideq*4]
%endif
    punpcklbw         m8, m24, m4
    punpckhbw         m9, m24, m4
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
    punpcklbw         m8, m25, m29
    punpckhbw         m9, m25, m29
    SWAP             m26, m29
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
    SWAP             m29, m8
    SWAP              m0, m9
    pmulhrsw          m8, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m8, m9
    vpblendmb    m12{k4}, m3, m8            ; don't clobber p1/m3 since we need it in G

    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
    ; write -1
%ifidn %2, h
    SWAP             m28, m24
    punpcklbw         m8, m28, m5
    punpckhbw        m24, m28, m5
%else
    punpcklbw         m8, m24, m5
    punpckhbw        m24, m5
%endif
    pmaddubsw         m8, pbm1_1
    pmaddubsw        m24, pbm1_1
    paddw            m10, m8
    paddw            m11, m24               ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
    punpcklbw        m24, m13, m30
    punpckhbw         m9, m13, m30
%ifidn %2, h
    SWAP             m27, m30
%endif
    SWAP             m13, m15
    pmaddubsw        m24, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m24
    paddw            m11, m9                ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
    SWAP             m30, m24
    SWAP             m15, m9
%ifidn %2, h
    SWAP              m9, m24
 %define pw2048 m9
%endif
    pmulhrsw         m24, m10, pw2048
    pmulhrsw          m8, m11, pw2048
    paddw            m10, m18               ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
    paddw            m11, m23
    packuswb         m24, m8
    punpcklbw         m8, m3, m31
    pmaddubsw         m8, pbm1_1
    paddw            m10, m8                ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
    SWAP             m18, m8
    pmulhrsw          m8, m10, pw2048
    paddw            m10, m16               ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
%ifidn %2, h
    SWAP             m16, m9
 %define pw2048 m16
%endif
    punpckhbw         m9, m3, m31
    SWAP              m3, m12
    pmaddubsw         m9, pbm1_1
    paddw            m11, m9                ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
    SWAP             m23, m9
    pmulhrsw          m9, m11, pw2048
    paddw            m11, m2                ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
%ifidn %2, h
    SWAP              m2, m1
 %define pbm1_1 m2
%endif
    vpblendmb     m1{k4}, m4, m24           ; don't clobber p0/m4 since we need it in H

    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
    ; write +0
    SWAP             m24, m31               ; q6
    packuswb          m8, m9
%ifidn %2, h
    SWAP             m31, m2
 %define pbm1_1 m31
%endif
    vpblendmb    m12{k4}, m5, m8            ; don't clobber q0/m5 since we need it in I

    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
    ; write +1
    punpcklbw         m8, m4, m24
    punpckhbw         m2, m4, m24
    SWAP              m4, m1
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m2, pbm1_1
    paddw            m10, m8
    paddw            m11, m2                ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
    pmulhrsw          m2, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m2, m9
    vpblendmb     m2{k4}, m6, m2            ; don't clobber q1/m6 since we need it in K

    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
    ; write +2
    paddw            m10, m17               ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
    paddw            m11, m7
    punpcklbw         m8, m5, m24
    punpckhbw         m9, m5, m24
    SWAP              m5, m12
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
    pmulhrsw          m7, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m7, m9
    vpblendmb     m7{k4}, m14, m7           ; don't clobber q2/m14 since we need it in K

    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
    ; write +3
    paddw            m10, m29               ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
    paddw            m11, m0
    punpcklbw         m8, m6, m24
    punpckhbw         m9, m6, m24
    SWAP               2, 6
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
    pmulhrsw          m8, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m8, m9
%ifidn %2, v
    vmovdqu8 [t0+mstrideq]{k4}, m8
%else
    SWAP             m29, m16
 %define pw2048 m29
    vpblendmb    m16{k4}, m22, m8
%endif

    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
    ; write +4
    paddw            m10, m30               ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
    paddw            m11, m15
%ifidn %2, h
    SWAP             m15, m8
%endif
    punpcklbw         m8, m14, m24
    punpckhbw         m9, m14, m24
    SWAP              14, 7
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
    pmulhrsw          m8, m10, pw2048
    pmulhrsw          m9, m11, pw2048
    packuswb          m8, m9
%ifidn %2, v
    vmovdqu8 [t0+strideq*0]{k4}, m8         ; q4
%else
    vpblendmb    m17{k4}, m26, m8
%endif

    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
    ; write +5
    paddw            m10, m18               ; p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*6
    paddw            m11, m23
    punpcklbw         m8, m22, m24
    punpckhbw         m9, m22, m24
    SWAP             m30, m24
    pmaddubsw         m8, pbm1_1
    pmaddubsw         m9, pbm1_1
    paddw            m10, m8
    paddw            m11, m9                ; p0+q0+q1+q2+q3+q4*2+q5*2+q6*7
    pmulhrsw         m10, pw2048
    pmulhrsw         m11, pw2048
    packuswb         m10, m11
%ifidn %2, v
    vmovdqu8 [t0+strideq*1]{k4}, m10        ; q5
%else
    vmovdqu8     m27{k4}, m10
%endif

%ifidn %2, v
    lea               t0, [dstq+mstrideq*4]
%endif
%endif

%if %1 >= 8
    ; flat8 filter
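    ; same sliding-sum scheme with a total weight of 8: the +4 rounding bias
    ; is folded in by interleaving q0 with pb_4 and multiply-adding by pb_1,
    ; so each output is a plain psrlw by 3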
    vpbroadcastd      m9, [pb_3_1]
    vpbroadcastd     m10, [pb_2_1]
%if %1 == 16
    vpbroadcastd     m23, [pb_1]
    vpbroadcastd      m0, [pb_4]
%elifidn %2, h
    vpbroadcastd     m31, [pb_m1_1]
 %define pbm1_1 m31
%endif
    punpcklbw        m24, m25, m3
    punpckhbw        m26, m25, m3
    pmaddubsw         m2, m24, m9
    pmaddubsw         m7, m26, m9           ; 3 * p3 + p1
    punpcklbw         m8, m13, m4
    punpckhbw        m11, m13, m4
    pmaddubsw         m8, m10
    pmaddubsw        m11, m10
    paddw             m2, m8
    paddw             m7, m11               ; 3 * p3 + 2 * p2 + p1 + p0
    punpcklbw         m8, m5, m0
    punpckhbw        m11, m5, m0
    pmaddubsw         m8, m23
    pmaddubsw        m11, m23
    paddw             m2, m8
    paddw             m7, m11               ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
    psrlw             m8, m2, 3
    psrlw            m11, m7, 3
    packuswb          m8, m11
%if is_h || %1 == 16
    vpblendmb    m10{k2}, m13, m8           ; p2
%endif
%ifidn %2, v
 %if %1 == 8
    vmovdqu8 [t0+strideq*1]{k2}, m8
 %else
    mova  [t0+strideq*1], m10
 %endif
%endif

    pmaddubsw         m8, m24, pbm1_1
    pmaddubsw        m11, m26, pbm1_1
    paddw             m2, m8
    paddw             m7, m11
    punpcklbw         m8, m13, m6
    punpckhbw        m11, m13, m6
    pmaddubsw         m8, pbm1_1
    pmaddubsw        m11, pbm1_1
    paddw             m2, m8
    paddw             m7, m11               ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
    psrlw             m8, m2, 3
    psrlw            m11, m7, 3
    packuswb          m8, m11
    vpblendmb     m8{k2}, m3, m8            ; p1
%ifidn %2, v
    mova  [t0+strideq*2], m8
%else
    SWAP             m18, m8
%endif

    pmaddubsw        m24, m23
    pmaddubsw        m26, m23
    psubw             m2, m24
    psubw             m7, m26
    punpcklbw         m8, m4, m14
    punpckhbw        m11, m4, m14
    pmaddubsw         m8, m23
    pmaddubsw        m11, m23
    paddw             m2, m8
    paddw             m7, m11               ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
    psrlw             m8, m2, 3
    psrlw            m11, m7, 3
    packuswb          m8, m11
    vpblendmb     m8{k2}, m4, m8            ; p0
%ifidn %2, v
    mova   [t0+stride3q], m8
%else
    SWAP             m29, m8
%endif

    punpcklbw        m24, m5, m22
    punpckhbw        m26, m5, m22
    pmaddubsw         m8, m24, m23
    pmaddubsw        m11, m26, m23
    paddw             m2, m8
    paddw             m7, m11
    punpcklbw         m8, m4, m25
    punpckhbw        m11, m4, m25
    pmaddubsw         m8, m23
    pmaddubsw        m11, m23
    psubw             m2, m8
    psubw             m7, m11               ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
    psrlw             m8, m2, 3
    psrlw            m11, m7, 3
    packuswb          m8, m11
    vpblendmb    m11{k2}, m5, m8            ; q0
%ifidn %2, v
    mova [dstq+strideq*0], m11
%endif

    pmaddubsw        m24, pbm1_1
    pmaddubsw        m26, pbm1_1
    paddw             m2, m24
    paddw             m7, m26
    punpcklbw         m8, m13, m6
    punpckhbw        m13, m6
    pmaddubsw         m8, pbm1_1
    pmaddubsw        m13, pbm1_1
    paddw             m2, m8
    paddw             m7, m13               ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
    psrlw             m8, m2, 3
    psrlw            m13, m7, 3
    packuswb          m8, m13
    vpblendmb    m13{k2}, m6, m8            ; q1
%ifidn %2, v
    mova [dstq+strideq*1], m13
%endif

    punpcklbw        m24, m3, m6
    punpckhbw        m26, m3, m6
    pmaddubsw        m24, m23
    pmaddubsw        m26, m23
    psubw             m2, m24
    psubw             m7, m26
    punpcklbw        m24, m14, m22
    punpckhbw        m26, m14, m22
    pmaddubsw        m24, m23
    pmaddubsw        m26, m23
    paddw             m2, m24
    paddw             m7, m26               ; p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4
    psrlw             m2, 3
    psrlw             m7, 3
    packuswb          m2, m7
%if is_h || %1 == 16
    vpblendmb     m2{k2}, m14, m2           ; q2
%endif
%ifidn %2, v
 %if %1 == 8
    vmovdqu8 [dstq+strideq*2]{k2}, m2
 %else
    mova [dstq+strideq*2], m2
 %endif
%endif

%ifidn %2, h
    SWAP             m24, m18
    SWAP             m26, m29
%if %1 == 8
    ; 16x8 transpose
    punpcklbw         m3, m25, m10
    punpckhbw        m25, m10
    punpcklbw        m10, m24, m26
    punpckhbw        m24, m26
    punpcklbw        m26, m11, m13
    punpckhbw        m11, m13
    punpcklbw        m13, m2, m22
    punpckhbw         m2, m22
    ;
    punpcklwd        m22, m3, m10
    punpckhwd         m3, m10
    punpcklwd        m10, m25, m24
    punpckhwd        m25, m24
    punpcklwd        m24, m26, m13
    punpckhwd        m26, m13
    punpcklwd        m13, m11, m2
    punpckhwd        m11, m2
    ;
    punpckldq         m2, m22, m24
    punpckhdq        m22, m24
    punpckldq        m24, m3, m26
    punpckhdq         m3, m26
    punpckldq        m26, m10, m13
    punpckhdq        m10, m13
    punpckldq        m13, m25, m11
    punpckhdq        m25, m11
    ; write 8x32
    vpbroadcastd    ym16, strided
    pmulld          ym16, [hmulD]
    lea               t1, [dstq+strideq*2]
    lea               t2, [dstq+strideq*4]
    lea               t3, [t1  +strideq*4]
    lea               t0, [dstq+strideq*8]
    kmovb             k1, k6
    kmovb             k2, k6
    kmovb             k3, k6
    kmovb             k4, k6
    vpscatterdq [dstq+ym16-4]{k1}, m2
    vpscatterdq [t1  +ym16-4]{k2}, m22
    vpscatterdq [t2  +ym16-4]{k3}, m24
    vpscatterdq [t3  +ym16-4]{k4}, m3
    lea               t1, [t0+strideq*2]
    lea               t2, [t0+strideq*4]
    lea               t3, [t1+strideq*4]
    kmovb             k1, k6
    kmovb             k2, k6
    kmovb             k3, k6
    kmovb             k4, k6
    vpscatterdq [t0+ym16-4]{k1}, m26
    vpscatterdq [t1+ym16-4]{k2}, m10
    vpscatterdq [t2+ym16-4]{k3}, m13
    vpscatterdq [t3+ym16-4]{k4}, m25
%else
    ; 16x16 transpose and store
    SWAP               5, 10, 2
    SWAP               6, 24
    SWAP               7, 26
    SWAP               8, 11
    SWAP               9, 13
    mova             m24, [rsp+0*64]
    SWAP             m26, m28
    mova              m2, [rsp+1*64]
    mova              m3, [rsp+2*64]
    mova              m4, [rsp+3*64]
    SWAP             m11, m16
    SWAP             m25, m17
    SWAP             m13, m27
    SWAP             m14, m30
    TRANSPOSE_16X16B 1, 0, [rsp+4*64]
    movu [dstq+strideq*0-8], xm24
    movu [dstq+strideq*1-8], xm26
    movu [dstq+strideq*2-8], xm2
    movu [dstq+stride3q -8], xm3
    lea               t0, [dstq+strideq*4]
    movu [t0+strideq*0-8], xm4
    movu [t0+strideq*1-8], xm5
    movu [t0+strideq*2-8], xm6
    movu [t0+stride3q -8], xm7
    lea               t0, [t0+strideq*4]
    movu [t0+strideq*0-8], xm8
    movu [t0+strideq*1-8], xm9
    movu [t0+strideq*2-8], xm10
    movu [t0+stride3q -8], xm11
    lea               t0, [t0+strideq*4]
    movu [t0+strideq*0-8], xm25
    movu [t0+strideq*1-8], xm13
    movu [t0+strideq*2-8], xm14
    movu [t0+stride3q -8], xm22
    lea               t0, [t0+strideq*4]
    vextracti128 [t0+strideq*0-8], ym24, 1
    vextracti128 [t0+strideq*1-8], ym26, 1
    vextracti128 [t0+strideq*2-8], ym2, 1
    vextracti128 [t0+stride3q -8], ym3, 1
    lea               t0, [t0+strideq*4]
    vextracti128 [t0+strideq*0-8], ym4, 1
    vextracti128 [t0+strideq*1-8], ym5, 1
    vextracti128 [t0+strideq*2-8], ym6, 1
    vextracti128 [t0+stride3q -8], ym7, 1
    lea               t0, [t0+strideq*4]
    vextracti128 [t0+strideq*0-8], ym8, 1
    vextracti128 [t0+strideq*1-8], ym9, 1
    vextracti128 [t0+strideq*2-8], ym10, 1
    vextracti128 [t0+stride3q -8], ym11, 1
    lea               t0, [t0+strideq*4]
    vextracti128 [t0+strideq*0-8], ym25, 1
    vextracti128 [t0+strideq*1-8], ym13, 1
    vextracti128 [t0+strideq*2-8], ym14, 1
    vextracti128 [t0+stride3q -8], ym22, 1
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m24, 2
    vextracti32x4 [t0+strideq*1-8], m26, 2
    vextracti32x4 [t0+strideq*2-8], m2, 2
    vextracti32x4 [t0+stride3q -8], m3, 2
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m4, 2
    vextracti32x4 [t0+strideq*1-8], m5, 2
    vextracti32x4 [t0+strideq*2-8], m6, 2
    vextracti32x4 [t0+stride3q -8], m7, 2
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m8, 2
    vextracti32x4 [t0+strideq*1-8], m9, 2
    vextracti32x4 [t0+strideq*2-8], m10, 2
    vextracti32x4 [t0+stride3q -8], m11, 2
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m25, 2
    vextracti32x4 [t0+strideq*1-8], m13, 2
    vextracti32x4 [t0+strideq*2-8], m14, 2
    vextracti32x4 [t0+stride3q -8], m22, 2
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m24, 3
    vextracti32x4 [t0+strideq*1-8], m26, 3
    vextracti32x4 [t0+strideq*2-8], m2, 3
    vextracti32x4 [t0+stride3q -8], m3, 3
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m4, 3
    vextracti32x4 [t0+strideq*1-8], m5, 3
    vextracti32x4 [t0+strideq*2-8], m6, 3
    vextracti32x4 [t0+stride3q -8], m7, 3
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m8, 3
    vextracti32x4 [t0+strideq*1-8], m9, 3
    vextracti32x4 [t0+strideq*2-8], m10, 3
    vextracti32x4 [t0+stride3q -8], m11, 3
    lea               t0, [t0+strideq*4]
    vextracti32x4 [t0+strideq*0-8], m25, 3
    vextracti32x4 [t0+strideq*1-8], m13, 3
    vextracti32x4 [t0+strideq*2-8], m14, 3
    vextracti32x4 [t0+stride3q -8], m22, 3
%endif
%endif

%elif %1 == 6
    ; flat6 filter
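    ; chroma-only 5-tap smoother; m16 is preloaded with pw_4096 by the uv
    ; entry points, so pmulhrsw against it is the rounded (sum + 4) >> 3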
    vpbroadcastd     m15, [pb_3_1]
    vpbroadcastd     m12, [pb_2]
    punpcklbw         m8, m13, m5
    punpckhbw        m11, m13, m5
    pmaddubsw         m0, m8, m15
    pmaddubsw         m1, m11, m15
    punpcklbw         m7, m4, m3
    punpckhbw        m10, m4, m3
    pmaddubsw         m2, m7, m12
    pmaddubsw        m12, m10, m12
%ifidn %2, h
    vpbroadcastd     m15, [pb_m1_1]
 %define pbm1_1 m15
%endif
    paddw             m0, m2
    paddw             m1, m12
    pmulhrsw          m2, m0, m16
    pmulhrsw         m12, m1, m16
    packuswb          m2, m12
    vpblendmb     m2{k2}, m3, m2            ; p1
%ifidn %2, v
    mova  [t0+strideq*2], m2
%endif

    pmaddubsw         m8, pbm1_1
    pmaddubsw        m11, pbm1_1
    paddw             m0, m8
    paddw             m1, m11
    punpcklbw         m8, m13, m6
    punpckhbw        m11, m13, m6
    pmaddubsw         m8, pbm1_1
    pmaddubsw        m11, pbm1_1
    paddw             m0, m8
    paddw             m1, m11
    pmulhrsw         m12, m0, m16
    pmulhrsw         m13, m1, m16
    packuswb         m12, m13
    vpblendmb    m12{k2}, m4, m12           ; p0
%ifidn %2, v
    mova   [t0+stride3q], m12
%endif

    vpbroadcastd      m9, [pb_m1_2]
    vpbroadcastd      m4, [pb_m1_0]
    paddw             m0, m8
    paddw             m1, m11
    punpcklbw         m8, m3, m14
    punpckhbw        m11, m3, m14
    pmaddubsw        m14, m8, pbm1_1
    pmaddubsw        m13, m11, pbm1_1
    paddw             m0, m14
    paddw             m1, m13
    pmulhrsw         m14, m0, m16
    pmulhrsw         m13, m1, m16
    packuswb         m14, m13
    vpblendmb    m14{k2}, m5, m14           ; q0
%ifidn %2, v
    mova [dstq+strideq*0], m14
%endif

    pmaddubsw         m8, m9
    pmaddubsw        m11, m9
    paddw             m0, m8
    paddw             m1, m11
    pmaddubsw         m7, m4
    pmaddubsw        m10, m4
    paddw             m0, m7
    paddw             m1, m10
    pmulhrsw          m0, m16
    pmulhrsw          m1, m16
    packuswb          m0, m1
    vpblendmb     m0{k2}, m6, m0            ; q1
%ifidn %2, v
    mova [dstq+strideq*1], m0
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
%endif
%else ; %1 == 4
%ifidn %2, v
    mova  [t0+strideq*0], m3                ; p1
    mova  [t0+strideq*1], m4                ; p0
    mova  [t0+strideq*2], m5                ; q0
    mova  [t0+stride3q ], m6                ; q1
%else
    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
%endif
%endif
%endmacro

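; the luma horizontal filter always operates on full 16-row stripes, so the
; 8-lane gather mask k7 used inside FILTER can simply alias the all-ones k6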
%define k7 k6

INIT_ZMM avx512icl
cglobal lpf_v_sb_y_8bpc, 7, 10, 32, dst, stride, mask, l, l_stride, \
                                    lut, w, stride3, mstride
 DECLARE_REG_TMP 9
    shl        l_strideq, 2
    sub               lq, l_strideq
    mov         mstrideq, strideq
    neg         mstrideq
    lea         stride3q, [strideq*3]
    mova             m21, [pb_4x0_4x4_4x8_4x12]
    mova             m20, [pb_mask]
    vpbroadcastd     m19, [pb_128]
    vpbroadcastd     m28, [pb_m1_1]
    vpbroadcastd     m27, [pw_2048]
 %define pbshuf m21
 %define pbmask m20
 %define pb128  m19
 %define pbm1_1 m28
 %define pw2048 m27

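; each iteration covers 64 pixels; a nonzero vmask word means at least one
; 4-pixel unit in the stripe needs that filter width, and FILTER resolves
; the per-unit masks internally from maskq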
.loop:
    cmp   word [maskq+8], 0                 ; vmask[2]
    je .no_flat16

    FILTER            16, v
    jmp .end

.no_flat16:
    cmp   word [maskq+4], 0                 ; vmask[1]
    je .no_flat

    FILTER             8, v
    jmp .end

.no_flat:
    cmp   word [maskq+0], 0                 ; vmask[0]
    je .end

    call .v4

.end:
    add               lq, 64
    add             dstq, 64
    add            maskq, 2
    sub               wd, 16
    jg .loop
    RET
ALIGN function_align
RESET_MM_PERMUTATION
.v4:
    FILTER             4, v
    ret

cglobal lpf_h_sb_y_8bpc, 7, 13, 32, 5*64, dst, stride, mask, l, l_stride, \
                                          lut, h, stride3, stride8
 DECLARE_REG_TMP 9, 10, 11, 12
    shl        l_strideq, 2
    sub               lq, 4
    lea         stride3q, [strideq*3]
    lea         stride8q, [strideq*8]
    kxnorw            k6, k6, k6
    vpbroadcastd     m19, strided
    vpbroadcastd     m20, l_strided
    pmulld           m21, m19, [hmulA]
    pmulld           m20, [hmulB]
    pmulld           m19, [hmulC]
 %define pbshuf [pb_4x0_4x4_4x8_4x12]
 %define pbmask [pb_mask]
 %define pb128  [pb_128]{bcstd}
    shl        l_strideq, 1

.loop:
    cmp   word [maskq+8], 0                 ; vmask[2]
    je .no_flat16

    FILTER            16, h
    jmp .end

.no_flat16:
    cmp   word [maskq+4], 0                 ; vmask[1]
    je .no_flat

    FILTER             8, h
    jmp .end

.no_flat:
    cmp   word [maskq+0], 0                 ; vmask[0]
    je .end

    call .h4

.end:
    lea               lq, [lq+l_strideq*8]
    lea             dstq, [dstq+stride8q*8]
    add            maskq, 2
    sub               hd, 16
    jg .loop
    RET
ALIGN function_align
RESET_MM_PERMUTATION
.h4:
    FILTER             4, h
    ret

cglobal lpf_v_sb_uv_8bpc, 7, 10, 22, dst, stride, mask, l, l_stride, \
                                     lut, w, stride3, mstride
 DECLARE_REG_TMP 9
    shl        l_strideq, 2
    sub               lq, l_strideq
    mov         mstrideq, strideq
    neg         mstrideq
    lea         stride3q, [strideq*3]
    mova             m21, [pb_4x0_4x4_4x8_4x12]
    mova             m20, [pb_mask]
    vpbroadcastd     m19, [pb_128]
    vpbroadcastd     m17, [pb_m1_1]
    vpbroadcastd     m16, [pw_4096]
 %define pbshuf m21
 %define pbmask m20
 %define pb128  m19
 %define pbm1_1 m17

.loop:
    cmp   word [maskq+4], 0                 ; vmask[1]
    je .no_flat

    FILTER             6, v
    jmp .end

.no_flat:
    cmp   word [maskq+0], 0                 ; vmask[0]
    je .end

    call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx512icl).v4

.end:
    add               lq, 64
    add             dstq, 64
    add            maskq, 2
    sub               wd, 16
    jg .loop
    RET

%undef k7
cglobal lpf_h_sb_uv_8bpc, 7, 12, 22, dst, stride, mask, l, l_stride, \
                                     lut, h, stride3, stride8
 DECLARE_REG_TMP 9, 10, 11
    mov              r7d, 0xffff
    movzx            r8d, r7b
    cmp               hd, 9
    cmovb            r7d, r8d
    kmovw             k6, r7d   ; h > 8 ? 0xffff : 0x00ff
    shl        l_strideq, 2
    sub               lq, 4
    kshiftrw          k7, k6, 4 ; h > 8 ? 0xff   : 0x0f
    lea         stride3q, [strideq*3]
    lea         stride8q, [strideq*8]
    vpbroadcastd     m19, strided
    vpbroadcastd     m20, l_strided
    pmulld           m21, m19, [hmulA]
    pmulld           m20, [hmulB]
    pmulld           m19, [hmulC]
    mova             m18, [pb_mask]
    vpbroadcastd     m17, [pb_128]
    vpbroadcastd     m16, [pw_4096]
 %define pbshuf [pb_4x0_4x4_4x8_4x12]
 %define pbmask m18
 %define pb128  m17
    add        l_strideq, l_strideq

.loop:
    cmp   word [maskq+4], 0                 ; vmask[1]
    je .no_flat

    FILTER             6, h
    jmp .end

.no_flat:
    cmp   word [maskq+0], 0                 ; vmask[0]
    je .end

    call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx512icl).h4

.end:
    lea               lq, [lq+l_strideq*8]
    lea             dstq, [dstq+stride8q*8]
    add            maskq, 2
    sub               hd, 16
    jg .loop
    RET

%endif ; ARCH_X86_64