; (HTML source-viewer navigation residue, preserved as comments so the file assembles)
; • Home
;   • Line#
;   • Scopes#
;   • Navigate#
;   • Raw
;   • Download
1; Copyright © 2018-2021, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 32
32
33pb_4x1_4x5_4x9_4x13: times 2 db 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12
34pb_7_1: times 16 db 7, 1
35pb_3_1: times 16 db 3, 1
36pb_2_1: times 16 db 2, 1
37pb_m1_0: times 16 db -1, 0
38pb_m1_1: times 16 db -1, 1
39pb_m1_2: times 16 db -1, 2
40pb_1: times 32 db 1
41pb_2: times 32 db 2
42pb_3: times 32 db 3
43pb_4: times 32 db 4
44pb_16: times 32 db 16
45pb_63: times 32 db 63
46pb_64: times 32 db 64
47pb_128: times 32 db 0x80
48pb_129: times 32 db 0x81
49pb_240: times 32 db 0xf0
50pb_248: times 32 db 0xf8
51pb_254: times 32 db 0xfe
52
53pw_2048: times 16 dw 2048
54pw_4096: times 16 dw 4096
55
56pb_mask: dd 1, 2, 4, 8, 16, 32, 64, 128
57
58SECTION .text
59
60%macro ABSSUB 4 ; dst, a, b, tmp
61    psubusb       %1, %2, %3
62    psubusb       %4, %3, %2
63    por           %1, %4
64%endmacro
65
66%macro TRANSPOSE_16x4_AND_WRITE_4x32 5
67    ; transpose 16x4
68    punpcklbw    m%5, m%1, m%2
69    punpckhbw    m%1, m%2
70    punpcklbw    m%2, m%3, m%4
71    punpckhbw    m%3, m%4
72    punpcklwd    m%4, m%5, m%2
73    punpckhwd    m%5, m%2
74    punpcklwd    m%2, m%1, m%3
75    punpckhwd    m%1, m%3
76
77    ; write out
78    movd [dstq+strideq*0-2], xm%4
79    pextrd [dstq+strideq*1-2], xm%4, 1
80    pextrd [dstq+strideq*2-2], xm%4, 2
81    pextrd [dstq+stride3q-2], xm%4, 3
82    lea         dstq, [dstq+strideq*4]
83    movd [dstq+strideq*0-2], xm%5
84    pextrd [dstq+strideq*1-2], xm%5, 1
85    pextrd [dstq+strideq*2-2], xm%5, 2
86    pextrd [dstq+stride3q-2], xm%5, 3
87    lea         dstq, [dstq+strideq*4]
88    movd [dstq+strideq*0-2], xm%2
89    pextrd [dstq+strideq*1-2], xm%2, 1
90    pextrd [dstq+strideq*2-2], xm%2, 2
91    pextrd [dstq+stride3q-2], xm%2, 3
92    lea         dstq, [dstq+strideq*4]
93    movd [dstq+strideq*0-2], xm%1
94    pextrd [dstq+strideq*1-2], xm%1, 1
95    pextrd [dstq+strideq*2-2], xm%1, 2
96    pextrd [dstq+stride3q-2], xm%1, 3
97    lea         dstq, [dstq+strideq*4]
98
99    vextracti128 xm%4, m%4, 1
100    vextracti128 xm%5, m%5, 1
101    vextracti128 xm%2, m%2, 1
102    vextracti128 xm%1, m%1, 1
103
104    movd [dstq+strideq*0-2], xm%4
105    pextrd [dstq+strideq*1-2], xm%4, 1
106    pextrd [dstq+strideq*2-2], xm%4, 2
107    pextrd [dstq+stride3q-2], xm%4, 3
108    lea         dstq, [dstq+strideq*4]
109    movd [dstq+strideq*0-2], xm%5
110    pextrd [dstq+strideq*1-2], xm%5, 1
111    pextrd [dstq+strideq*2-2], xm%5, 2
112    pextrd [dstq+stride3q-2], xm%5, 3
113    lea         dstq, [dstq+strideq*4]
114    movd [dstq+strideq*0-2], xm%2
115    pextrd [dstq+strideq*1-2], xm%2, 1
116    pextrd [dstq+strideq*2-2], xm%2, 2
117    pextrd [dstq+stride3q-2], xm%2, 3
118    lea         dstq, [dstq+strideq*4]
119    movd [dstq+strideq*0-2], xm%1
120    pextrd [dstq+strideq*1-2], xm%1, 1
121    pextrd [dstq+strideq*2-2], xm%1, 2
122    pextrd [dstq+stride3q-2], xm%1, 3
123    lea         dstq, [dstq+strideq*4]
124%endmacro
125
126%macro TRANSPOSE_16X16B 3 ; in_load_15_from_mem, out_store_0_in_mem, mem
127%if %1 == 0
128    mova          %3, m15
129%endif
130
131    ; input in m0-15
132    punpcklbw    m15, m0, m1
133    punpckhbw     m0, m1
134    punpcklbw     m1, m2, m3
135    punpckhbw     m2, m3
136    punpcklbw     m3, m4, m5
137    punpckhbw     m4, m5
138    punpcklbw     m5, m6, m7
139    punpckhbw     m6, m7
140    punpcklbw     m7, m8, m9
141    punpckhbw     m8, m9
142    punpcklbw     m9, m10, m11
143    punpckhbw    m10, m11
144    punpcklbw    m11, m12, m13
145    punpckhbw    m12, m13
146    mova         m13, %3
147    mova          %3, m12
148    punpcklbw    m12, m14, m13
149    punpckhbw    m13, m14, m13
150
151    ; interleaved in m15,0,1,2,3,4,5,6,7,8,9,10,11,rsp%3,12,13
152    punpcklwd    m14, m15, m1
153    punpckhwd    m15, m1
154    punpcklwd     m1, m0, m2
155    punpckhwd     m0, m2
156    punpcklwd     m2, m3, m5
157    punpckhwd     m3, m5
158    punpcklwd     m5, m4, m6
159    punpckhwd     m4, m6
160    punpcklwd     m6, m7, m9
161    punpckhwd     m7, m9
162    punpcklwd     m9, m8, m10
163    punpckhwd     m8, m10
164    punpcklwd    m10, m11, m12
165    punpckhwd    m11, m12
166    mova         m12, %3
167    mova          %3, m11
168    punpcklwd    m11, m12, m13
169    punpckhwd    m12, m13
170
171    ; interleaved in m14,15,1,0,2,3,5,4,6,7,9,8,10,rsp%3,11,12
172    punpckldq    m13, m14, m2
173    punpckhdq    m14, m2
174    punpckldq     m2, m15, m3
175    punpckhdq    m15, m3
176    punpckldq     m3, m1, m5
177    punpckhdq     m1, m5
178    punpckldq     m5, m0, m4
179    punpckhdq     m0, m4
180    punpckldq     m4, m6, m10
181    punpckhdq     m6, m10
182    punpckldq    m10, m9, m11
183    punpckhdq     m9, m11
184    punpckldq    m11, m8, m12
185    punpckhdq     m8, m12
186    mova         m12, %3
187    mova          %3, m8
188    punpckldq     m8, m7, m12
189    punpckhdq     m7, m12
190
191    ; interleaved in m13,14,2,15,3,1,5,0,4,6,8,7,10,9,11,rsp%3
192    punpcklqdq   m12, m13, m4
193    punpckhqdq   m13, m4
194    punpcklqdq    m4, m14, m6
195    punpckhqdq   m14, m6
196    punpcklqdq    m6, m2, m8
197    punpckhqdq    m2, m8
198    punpcklqdq    m8, m15, m7
199    punpckhqdq   m15, m7
200    punpcklqdq    m7, m3, m10
201    punpckhqdq    m3, m10
202    punpcklqdq   m10, m1, m9
203    punpckhqdq    m1, m9
204    punpcklqdq    m9, m5, m11
205    punpckhqdq    m5, m11
206    mova         m11, %3
207    mova          %3, m12
208    punpcklqdq   m12, m0, m11
209    punpckhqdq    m0, m11
210%if %2 == 0
211    mova         m11, %3
212%endif
213
214    ; interleaved m11,13,4,14,6,2,8,15,7,3,10,1,9,5,12,0
215    SWAP          0, 11, 1, 13, 5, 2, 4, 6, 8, 7, 15
216    SWAP          3, 14, 12, 9
217%endmacro
218
219%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
220    ; load data
221%ifidn %2, v
222%if %1 == 4
223    lea         tmpq, [dstq+mstrideq*2]
224    mova          m3, [tmpq+strideq*0]          ; p1
225    mova          m4, [tmpq+strideq*1]          ; p0
226    mova          m5, [tmpq+strideq*2]          ; q0
227    mova          m6, [tmpq+stride3q]           ; q1
228%else
229    ; load 6-8 pixels, remainder (for wd=16) will be read inline
230    lea         tmpq, [dstq+mstrideq*4]
231%if %1 != 6
232    mova         m12, [tmpq+strideq*0]
233%endif
234    mova         m13, [tmpq+strideq*1]
235    mova          m3, [tmpq+strideq*2]
236    mova          m4, [tmpq+stride3q]
237    mova          m5, [dstq+strideq*0]
238    mova          m6, [dstq+strideq*1]
239    mova         m14, [dstq+strideq*2]
240%if %1 != 6
241    mova         m15, [dstq+stride3q]
242%endif
243%endif
244%else
245    ; load lines
246%if %1 == 4
247    movd         xm3, [dstq+strideq*0-2]
248    movd         xm4, [dstq+strideq*1-2]
249    movd         xm5, [dstq+strideq*2-2]
250    movd         xm6, [dstq+stride3q -2]
251    lea         tmpq, [dstq+strideq*4]
252    pinsrd       xm3, [tmpq+strideq*0-2], 2
253    pinsrd       xm4, [tmpq+strideq*1-2], 2
254    pinsrd       xm5, [tmpq+strideq*2-2], 2
255    pinsrd       xm6, [tmpq+stride3q -2], 2
256    lea         tmpq, [tmpq+strideq*4]
257    pinsrd       xm3, [tmpq+strideq*0-2], 1
258    pinsrd       xm4, [tmpq+strideq*1-2], 1
259    pinsrd       xm5, [tmpq+strideq*2-2], 1
260    pinsrd       xm6, [tmpq+stride3q -2], 1
261    lea         tmpq, [tmpq+strideq*4]
262    pinsrd       xm3, [tmpq+strideq*0-2], 3
263    pinsrd       xm4, [tmpq+strideq*1-2], 3
264    pinsrd       xm5, [tmpq+strideq*2-2], 3
265    pinsrd       xm6, [tmpq+stride3q -2], 3
266    lea         tmpq, [tmpq+strideq*4]
267    movd        xm12, [tmpq+strideq*0-2]
268    movd        xm13, [tmpq+strideq*1-2]
269    movd        xm14, [tmpq+strideq*2-2]
270    movd        xm15, [tmpq+stride3q -2]
271    lea         tmpq, [tmpq+strideq*4]
272    pinsrd      xm12, [tmpq+strideq*0-2], 2
273    pinsrd      xm13, [tmpq+strideq*1-2], 2
274    pinsrd      xm14, [tmpq+strideq*2-2], 2
275    pinsrd      xm15, [tmpq+stride3q -2], 2
276    lea         tmpq, [tmpq+strideq*4]
277    pinsrd      xm12, [tmpq+strideq*0-2], 1
278    pinsrd      xm13, [tmpq+strideq*1-2], 1
279    pinsrd      xm14, [tmpq+strideq*2-2], 1
280    pinsrd      xm15, [tmpq+stride3q -2], 1
281    lea         tmpq, [tmpq+strideq*4]
282    pinsrd      xm12, [tmpq+strideq*0-2], 3
283    pinsrd      xm13, [tmpq+strideq*1-2], 3
284    pinsrd      xm14, [tmpq+strideq*2-2], 3
285    pinsrd      xm15, [tmpq+stride3q -2], 3
286    vinserti128   m3, xm12, 1
287    vinserti128   m4, xm13, 1
288    vinserti128   m5, xm14, 1
289    vinserti128   m6, xm15, 1
290
291    ; transpose 4x16
292    ; xm3: A-D0,A-D8,A-D4,A-D12
293    ; xm4: A-D1,A-D9,A-D5,A-D13
294    ; xm5: A-D2,A-D10,A-D6,A-D14
295    ; xm6: A-D3,A-D11,A-D7,A-D15
296    punpcklbw     m7, m3, m4
297    punpckhbw     m3, m4
298    punpcklbw     m4, m5, m6
299    punpckhbw     m5, m6
300    ; xm7: A0-1,B0-1,C0-1,D0-1,A8-9,B8-9,C8-9,D8-9
301    ; xm3: A4-5,B4-5,C4-5,D4-5,A12-13,B12-13,C12-13,D12-13
302    ; xm4: A2-3,B2-3,C2-3,D2-3,A10-11,B10-11,C10-11,D10-11
303    ; xm5: A6-7,B6-7,C6-7,D6-7,A14-15,B14-15,C14-15,D14-15
304    punpcklwd     m6, m7, m4
305    punpckhwd     m7, m4
306    punpcklwd     m4, m3, m5
307    punpckhwd     m3, m5
308    ; xm6: A0-3,B0-3,C0-3,D0-3
309    ; xm7: A8-11,B8-11,C8-11,D8-11
310    ; xm4: A4-7,B4-7,C4-7,D4-7
311    ; xm3: A12-15,B12-15,C12-15,D12-15
312    punpckldq     m5, m6, m4
313    punpckhdq     m6, m4
314    punpckldq     m4, m7, m3
315    punpckhdq     m7, m3
316    ; xm5: A0-7,B0-7
317    ; xm6: C0-7,D0-7
318    ; xm4: A8-15,B8-15
319    ; xm7: C8-15,D8-15
320    punpcklqdq    m3, m5, m4
321    punpckhqdq    m4, m5, m4
322    punpcklqdq    m5, m6, m7
323    punpckhqdq    m6, m7
324    ; xm3: A0-15
325    ; xm5: B0-15
326    ; xm4: C0-15
327    ; xm6: D0-15
328%elif %1 == 6 || %1 == 8
329    movq         xm3, [dstq+strideq*0-%1/2]
330    movq         xm4, [dstq+strideq*1-%1/2]
331    movq         xm5, [dstq+strideq*2-%1/2]
332    movq         xm6, [dstq+stride3q -%1/2]
333    lea         tmpq, [dstq+strideq*8]
334    movhps       xm3, [tmpq+strideq*0-%1/2]
335    movhps       xm4, [tmpq+strideq*1-%1/2]
336    movhps       xm5, [tmpq+strideq*2-%1/2]
337    movhps       xm6, [tmpq+stride3q -%1/2]
338    lea         tmpq, [tmpq+strideq*8]
339    movq         xm7, [tmpq+strideq*0-%1/2]
340    movq         xm8, [tmpq+strideq*1-%1/2]
341    movq         xm9, [tmpq+strideq*2-%1/2]
342    movq        xm11, [tmpq+stride3q -%1/2]
343    lea         tmpq, [tmpq+strideq*8]
344    movhps       xm7, [tmpq+strideq*0-%1/2]
345    movhps       xm8, [tmpq+strideq*1-%1/2]
346    movhps       xm9, [tmpq+strideq*2-%1/2]
347    movhps      xm11, [tmpq+stride3q -%1/2]
348    vinserti128   m3, xm7, 1
349    vinserti128   m4, xm8, 1
350    vinserti128   m5, xm9, 1
351    vinserti128   m6, xm11, 1
352    lea         tmpq, [dstq+strideq*4]
353    movq        xm12, [tmpq+strideq*0-%1/2]
354    movq        xm13, [tmpq+strideq*1-%1/2]
355    movq        xm14, [tmpq+strideq*2-%1/2]
356    movq        xm15, [tmpq+stride3q -%1/2]
357    lea         tmpq, [tmpq+strideq*8]
358    movhps      xm12, [tmpq+strideq*0-%1/2]
359    movhps      xm13, [tmpq+strideq*1-%1/2]
360    movhps      xm14, [tmpq+strideq*2-%1/2]
361    movhps      xm15, [tmpq+stride3q -%1/2]
362    lea         tmpq, [tmpq+strideq*8]
363    movq         xm7, [tmpq+strideq*0-%1/2]
364    movq         xm8, [tmpq+strideq*1-%1/2]
365    movq         xm9, [tmpq+strideq*2-%1/2]
366    movq        xm11, [tmpq+stride3q -%1/2]
367    lea         tmpq, [tmpq+strideq*8]
368    movhps       xm7, [tmpq+strideq*0-%1/2]
369    movhps       xm8, [tmpq+strideq*1-%1/2]
370    movhps       xm9, [tmpq+strideq*2-%1/2]
371    movhps      xm11, [tmpq+stride3q -%1/2]
372    vinserti128  m12, xm7, 1
373    vinserti128  m13, xm8, 1
374    vinserti128  m14, xm9, 1
375    vinserti128  m15, xm11, 1
376
377    ; transpose 8x16
378    ; xm3: A-H0,A-H8
379    ; xm4: A-H1,A-H9
380    ; xm5: A-H2,A-H10
381    ; xm6: A-H3,A-H11
382    ; xm12: A-H4,A-H12
383    ; xm13: A-H5,A-H13
384    ; xm14: A-H6,A-H14
385    ; xm15: A-H7,A-H15
386    punpcklbw    m7, m3, m4
387    punpckhbw    m3, m4
388    punpcklbw    m4, m5, m6
389    punpckhbw    m5, m6
390    punpcklbw    m6, m12, m13
391    punpckhbw   m12, m13
392    punpcklbw   m13, m14, m15
393    punpckhbw   m14, m15
394    ; xm7: A0-1,B0-1,C0-1,D0-1,E0-1,F0-1,G0-1,H0-1
395    ; xm3: A8-9,B8-9,C8-9,D8-9,E8-9,F8-9,G8-9,H8-9
396    ; xm4: A2-3,B2-3,C2-3,D2-3,E2-3,F2-3,G2-3,H2-3
397    ; xm5: A10-11,B10-11,C10-11,D10-11,E10-11,F10-11,G10-11,H10-11
398    ; xm6: A4-5,B4-5,C4-5,D4-5,E4-5,F4-5,G4-5,H4-5
399    ; xm12: A12-13,B12-13,C12-13,D12-13,E12-13,F12-13,G12-13,H12-13
400    ; xm13: A6-7,B6-7,C6-7,D6-7,E6-7,F6-7,G6-7,H6-7
401    ; xm14: A14-15,B14-15,C14-15,D14-15,E14-15,F14-15,G14-15,H14-15
402    punpcklwd   m15, m7, m4
403    punpckhwd    m7, m4
404    punpcklwd    m4, m3, m5
405    punpckhwd    m3, m5
406    punpcklwd    m5, m6, m13
407    punpckhwd    m6, m13
408    punpcklwd   m13, m12, m14
409    punpckhwd   m12, m14
410    ; xm15: A0-3,B0-3,C0-3,D0-3
411    ; xm7: E0-3,F0-3,G0-3,H0-3
412    ; xm4: A8-11,B8-11,C8-11,D8-11
413    ; xm3: E8-11,F8-11,G8-11,H8-11
414    ; xm5: A4-7,B4-7,C4-7,D4-7
415    ; xm6: E4-7,F4-7,G4-7,H4-7
416    ; xm13: A12-15,B12-15,C12-15,D12-15
417    ; xm12: E12-15,F12-15,G12-15,H12-15
418    punpckldq   m14, m15, m5
419    punpckhdq   m15, m5
420    punpckldq    m5, m7, m6
421%if %1 != 6
422    punpckhdq    m7, m6
423%endif
424    punpckldq    m6, m4, m13
425    punpckhdq    m4, m13
426    punpckldq   m13, m3, m12
427%if %1 != 6
428    punpckhdq   m12, m3, m12
429%endif
430    ; xm14: A0-7,B0-7
431    ; xm15: C0-7,D0-7
432    ; xm5: E0-7,F0-7
433    ; xm7: G0-7,H0-7
434    ; xm6: A8-15,B8-15
435    ; xm4: C8-15,D8-15
436    ; xm13: E8-15,F8-15
437    ; xm12: G8-15,H8-15
438    punpcklqdq   m3, m14, m6
439    punpckhqdq  m14, m6
440    punpckhqdq   m6, m15, m4
441    punpcklqdq  m15, m4
442    punpcklqdq   m4, m5, m13
443    punpckhqdq  m13, m5, m13
444%if %1 == 8
445    punpcklqdq   m5, m7, m12
446    punpckhqdq  m12, m7, m12
447    ; xm3: A0-15
448    ; xm14: B0-15
449    ; xm15: C0-15
450    ; xm6: D0-15
451    ; xm4: E0-15
452    ; xm13: F0-15
453    ; xm5: G0-15
454    ; xm12: H0-15
455    SWAP         12, 3, 15
456    SWAP         13, 14, 5, 4, 6
457    ; 3,14,15,6,4,13,5,12 -> 12,13,3,4,5,6,14,15
458%else
459    SWAP         13, 3, 14
460    SWAP          6, 4, 15, 5
461    ; 3,14,15,6,4,13 -> 13,3,4,5,6,14
462%endif
463%else
464    ; load and 16x16 transpose. We only use 14 pixels but we'll need the
465    ; remainder at the end for the second transpose
466    movu         xm0, [dstq+strideq*0-8]
467    movu         xm1, [dstq+strideq*1-8]
468    movu         xm2, [dstq+strideq*2-8]
469    movu         xm3, [dstq+stride3q -8]
470    lea         tmpq, [dstq+strideq*4]
471    movu         xm4, [tmpq+strideq*0-8]
472    movu         xm5, [tmpq+strideq*1-8]
473    movu         xm6, [tmpq+strideq*2-8]
474    movu         xm7, [tmpq+stride3q -8]
475    lea         tmpq, [tmpq+strideq*4]
476    movu         xm8, [tmpq+strideq*0-8]
477    movu         xm9, [tmpq+strideq*1-8]
478    movu        xm10, [tmpq+strideq*2-8]
479    movu        xm11, [tmpq+stride3q -8]
480    lea         tmpq, [tmpq+strideq*4]
481    movu        xm12, [tmpq+strideq*0-8]
482    movu        xm13, [tmpq+strideq*1-8]
483    movu        xm14, [tmpq+strideq*2-8]
484    movu        xm15, [tmpq+stride3q -8]
485    lea         tmpq, [tmpq+strideq*4]
486    vinserti128   m0, [tmpq+strideq*0-8], 1
487    vinserti128   m1, [tmpq+strideq*1-8], 1
488    vinserti128   m2, [tmpq+strideq*2-8], 1
489    vinserti128   m3, [tmpq+stride3q -8], 1
490    lea         tmpq, [tmpq+strideq*4]
491    vinserti128   m4, [tmpq+strideq*0-8], 1
492    vinserti128   m5, [tmpq+strideq*1-8], 1
493    vinserti128   m6, [tmpq+strideq*2-8], 1
494    vinserti128   m7, [tmpq+stride3q -8], 1
495    lea         tmpq, [tmpq+strideq*4]
496    vinserti128   m8, [tmpq+strideq*0-8], 1
497    vinserti128   m9, [tmpq+strideq*1-8], 1
498    vinserti128  m10, [tmpq+strideq*2-8], 1
499    vinserti128  m11, [tmpq+stride3q -8], 1
500    lea         tmpq, [tmpq+strideq*4]
501    vinserti128  m12, [tmpq+strideq*0-8], 1
502    vinserti128  m13, [tmpq+strideq*1-8], 1
503    vinserti128  m14, [tmpq+strideq*2-8], 1
504    vinserti128  m15, [tmpq+stride3q -8], 1
505
506    TRANSPOSE_16X16B 0, 1, [rsp+11*32]
507    mova  [rsp+12*32], m1
508    mova  [rsp+13*32], m2
509    mova  [rsp+14*32], m3
510    mova  [rsp+15*32], m12
511    mova  [rsp+16*32], m13
512    mova  [rsp+17*32], m14
513    mova  [rsp+18*32], m15
514    ; 4,5,6,7,8,9,10,11 -> 12,13,3,4,5,6,14,15
515    SWAP           12, 4, 7
516    SWAP           13, 5, 8
517    SWAP            3, 6, 9
518    SWAP           10, 14
519    SWAP           11, 15
520%endif
521%endif
522
523    ; load L/E/I/H
524%ifidn %2, v
525    movu          m1, [lq]
526    movu          m0, [lq+l_strideq]
527%else
528    movq         xm1, [lq]
529    movq         xm2, [lq+l_strideq*2]
530    movhps       xm1, [lq+l_strideq]
531    movhps       xm2, [lq+l_stride3q]
532    lea           lq, [lq+l_strideq*4]
533    movq        xm10, [lq]
534    movq         xm0, [lq+l_strideq*2]
535    movhps      xm10, [lq+l_strideq]
536    movhps       xm0, [lq+l_stride3q]
537    lea           lq, [lq+l_strideq*4]
538    vinserti128   m1, xm10, 1
539    vinserti128   m2, xm0, 1
540    shufps        m0, m1, m2, q3131
541    shufps        m1, m2, q2020
542%endif
543    pxor          m2, m2
544    pcmpeqb      m10, m2, m0
545    pand          m1, m10
546    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
547    pshufb        m0, [pb_4x1_4x5_4x9_4x13]     ; l[x][1]
548    pcmpeqb      m10, m2, m0                    ; !L
549    psrlq         m2, m0, [lutq+128]
550    pand          m2, [pb_63]
551    vpbroadcastb  m1, [lutq+136]
552    pminub        m2, m1
553    pmaxub        m2, [pb_1]                    ; I
554    pand          m1, m0, [pb_240]
555    psrlq         m1, 4                         ; H
556    paddb         m0, [pb_2]
557    paddb         m0, m0
558    paddb         m0, m2                        ; E
559    pxor          m1, [pb_128]
560    pxor          m2, [pb_128]
561    pxor          m0, [pb_128]
562
563    ABSSUB        m8, m3, m4, m9                ; abs(p1-p0)
564    pmaxub        m8, m10
565    ABSSUB        m9, m5, m6, m10               ; abs(q1-q0)
566    pmaxub        m8, m9
567%if %1 == 4
568    pxor          m8, [pb_128]
569    pcmpgtb       m7, m8, m1                    ; hev
570%else
571    pxor          m7, m8, [pb_128]
572    pcmpgtb       m7, m1                        ; hev
573
574%if %1 == 6
575    ABSSUB        m9, m13, m4, m10              ; abs(p2-p0)
576    pmaxub        m9, m8
577%else
578    ABSSUB        m9, m12, m4, m10              ; abs(p3-p0)
579    pmaxub        m9, m8
580    ABSSUB       m10, m13, m4, m11              ; abs(p2-p0)
581    pmaxub        m9, m10
582%endif
583    ABSSUB       m10, m5,  m14, m11             ; abs(q2-q0)
584    pmaxub        m9, m10
585%if %1 != 6
586    ABSSUB       m10, m5,  m15, m11             ; abs(q3-q0)
587    pmaxub        m9, m10
588%endif
589    pxor          m9, [pb_128]
590    pcmpgtb       m9, [pb_129]                  ; !flat8in
591
592%if %1 == 6
593    ABSSUB       m10, m13, m3,  m1              ; abs(p2-p1)
594%else
595    ABSSUB       m10, m12, m13, m11             ; abs(p3-p2)
596    ABSSUB       m11, m13, m3,  m1              ; abs(p2-p1)
597    pmaxub       m10, m11
598    ABSSUB       m11, m14, m15, m1              ; abs(q3-q2)
599    pmaxub       m10, m11
600%endif
601    ABSSUB       m11, m14, m6,  m1              ; abs(q2-q1)
602    pmaxub       m10, m11
603%if %1 == 16
604    vpbroadcastd m11, [maskq+8]
605    vpbroadcastd  m1, [maskq+4]
606    por          m11, m1
607    pand         m11, [pb_mask]
608    pcmpeqd      m11, [pb_mask]
609    pand         m10, m11
610%else
611    vpbroadcastd m11, [maskq+4]
612    pand         m11, [pb_mask]
613    pcmpeqd      m11, [pb_mask]
614    pand         m10, m11                       ; only apply fm-wide to wd>4 blocks
615%endif
616    pmaxub        m8, m10
617
618    pxor          m8, [pb_128]
619%endif
620    pcmpgtb       m8, m2
621
622    ABSSUB       m10, m3, m6, m11               ; abs(p1-q1)
623    ABSSUB       m11, m4, m5, m2                ; abs(p0-q0)
624    paddusb      m11, m11
625    pand         m10, [pb_254]
626    psrlq        m10, 1
627    paddusb      m10, m11                       ; abs(p0-q0)*2+(abs(p1-q1)>>1)
628    pxor         m10, [pb_128]
629    pcmpgtb      m10, m0                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
630    por           m8, m10
631
632%if %1 == 16
633%ifidn %2, v
634    lea         tmpq, [dstq+mstrideq*8]
635    mova          m0, [tmpq+strideq*1]
636%else
637    mova          m0, [rsp+12*32]
638%endif
639    ABSSUB        m1, m0, m4, m2
640%ifidn %2, v
641    mova          m0, [tmpq+strideq*2]
642%else
643    mova          m0, [rsp+13*32]
644%endif
645    ABSSUB        m2, m0, m4, m10
646    pmaxub        m1, m2
647%ifidn %2, v
648    mova          m0, [tmpq+stride3q]
649%else
650    mova          m0, [rsp+14*32]
651%endif
652    ABSSUB        m2, m0, m4, m10
653    pmaxub        m1, m2
654%ifidn %2, v
655    lea         tmpq, [dstq+strideq*4]
656    mova          m0, [tmpq+strideq*0]
657%else
658    mova          m0, [rsp+15*32]
659%endif
660    ABSSUB        m2, m0, m5, m10
661    pmaxub        m1, m2
662%ifidn %2, v
663    mova          m0, [tmpq+strideq*1]
664%else
665    mova          m0, [rsp+16*32]
666%endif
667    ABSSUB        m2, m0, m5, m10
668    pmaxub        m1, m2
669%ifidn %2, v
670    mova          m0, [tmpq+strideq*2]
671%else
672    mova          m0, [rsp+17*32]
673%endif
674    ABSSUB        m2, m0, m5, m10
675    pmaxub        m1, m2
676    pxor          m1, [pb_128]
677    pcmpgtb       m1, [pb_129]                  ; !flat8out
678    por           m1, m9                        ; !flat8in | !flat8out
679    vpbroadcastd  m2, [maskq+8]
680    pand         m10, m2, [pb_mask]
681    pcmpeqd      m10, [pb_mask]
682    pandn         m1, m10                       ; flat16
683    pandn         m1, m8, m1                    ; flat16 & fm
684
685    vpbroadcastd m10, [maskq+4]
686    por          m10, m2
687    pand          m2, m10, [pb_mask]
688    pcmpeqd       m2, [pb_mask]
689    pandn         m9, m2                        ; flat8in
690    pandn         m9, m8, m9
691    vpbroadcastd  m2, [maskq+0]
692    por           m2, m10
693    pand          m2, [pb_mask]
694    pcmpeqd       m2, [pb_mask]
695    pandn         m8, m2
696    pandn         m8, m9, m8                    ; fm & !flat8 & !flat16
697    pandn         m9, m1, m9                    ; flat8 & !flat16
698%elif %1 != 4
699    vpbroadcastd  m0, [maskq+4]
700    pand          m2, m0, [pb_mask]
701    pcmpeqd       m2, [pb_mask]
702    pandn         m9, m2
703    pandn         m9, m8, m9                    ; flat8 & fm
704    vpbroadcastd  m2, [maskq+0]
705    por           m0, m2
706    pand          m0, [pb_mask]
707    pcmpeqd       m0, [pb_mask]
708    pandn         m8, m0
709    pandn         m8, m9, m8                    ; fm & !flat8
710%else
711    vpbroadcastd  m0, [maskq+0]
712    pand          m0, [pb_mask]
713    pcmpeqd       m0, [pb_mask]
714    pandn         m8, m0                        ; fm
715%endif
716
717    ; short filter
718
719    pxor          m3, [pb_128]
720    pxor          m6, [pb_128]
721    psubsb       m10, m3, m6                    ; iclip_diff(p1-q1)
722    pand         m10, m7                        ; f=iclip_diff(p1-q1)&hev
723    pxor          m4, [pb_128]
724    pxor          m5, [pb_128]
725    psubsb       m11, m5, m4
726    paddsb       m10, m11
727    paddsb       m10, m11
728    paddsb       m10, m11                       ; f=iclip_diff(3*(q0-p0)+f)
729    pand          m8, m10                       ; f&=fm
730    paddsb       m10, m8, [pb_3]
731    paddsb        m8, [pb_4]
732    pand         m10, [pb_248]
733    pand          m8, [pb_248]
734    psrlq        m10, 3
735    psrlq         m8, 3
736    pxor         m10, [pb_16]
737    pxor          m8, [pb_16]
738    psubb        m10, [pb_16]                   ; f2
739    psubb         m8, [pb_16]                   ; f1
740    paddsb        m4, m10
741    psubsb        m5, m8
742    pxor          m4, [pb_128]
743    pxor          m5, [pb_128]
744
745    pxor          m8, [pb_128]
746    pxor         m10, m10
747    pavgb         m8, m10                       ; f=(f1+1)>>1
748    psubb         m8, [pb_64]
749    pandn         m8, m7, m8                    ; f&=!hev
750    paddsb        m3, m8
751    psubsb        m6, m8
752    pxor          m3, [pb_128]
753    pxor          m6, [pb_128]
754
755%if %1 == 16
756    ; flat16 filter
757%ifidn %2, v
758    lea         tmpq, [dstq+mstrideq*8]
759    mova          m0, [tmpq+strideq*1]          ; p6
760    mova          m2, [tmpq+strideq*2]          ; p5
761    mova          m7, [tmpq+stride3q]           ; p4
762%else
763    mova          m0, [rsp+12*32]
764    mova          m2, [rsp+13*32]
765    mova          m7, [rsp+14*32]
766%endif
767
768    mova  [rsp+0*32], m9
769    mova  [rsp+1*32], m14
770    mova  [rsp+2*32], m15
771
772    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0 [p5/p4/p2/p1/p0/q0][p6/p3] A
773    ; write -6
774    punpcklbw    m14, m0, m12
775    punpckhbw    m15, m0, m12
776    pmaddubsw    m10, m14, [pb_7_1]
777    pmaddubsw    m11, m15, [pb_7_1]             ; p6*7+p3
778    punpcklbw     m8, m2, m7
779    punpckhbw     m9, m2, m7
780    pmaddubsw     m8, [pb_2]
781    pmaddubsw     m9, [pb_2]
782    paddw        m10, m8
783    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3
784    punpcklbw     m8, m13, m3
785    punpckhbw     m9, m13, m3
786    pmaddubsw     m8, [pb_1]
787    pmaddubsw     m9, [pb_1]
788    paddw        m10, m8
789    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1
790    punpcklbw     m8, m4, m5
791    punpckhbw     m9, m4, m5
792    pmaddubsw     m8, [pb_1]
793    pmaddubsw     m9, [pb_1]
794    paddw        m10, m8
795    paddw        m11, m9                        ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
796    pmulhrsw      m8, m10, [pw_2048]
797    pmulhrsw      m9, m11, [pw_2048]
798    packuswb      m8, m9
799    pand          m8, m1
800    pandn         m9, m1, m2
801    por           m8, m9
802%ifidn %2, v
803    mova [tmpq+strideq*2], m8                   ; p5
804%else
805    mova [rsp+13*32], m8
806%endif
807
808    ; sub p6*2, add p3/q1 [reuse p6/p3 from A][-p6,+q1|save] B
809    ; write -5
810    pmaddubsw    m14, [pb_m1_1]
811    pmaddubsw    m15, [pb_m1_1]
812    paddw        m10, m14
813    paddw        m11, m15                       ; p6*6+p5*2+p4*2+p3*2+p2+p1+p0+q0
814    punpcklbw     m8, m0, m6
815    punpckhbw     m9, m0, m6
816    pmaddubsw     m8, [pb_m1_1]
817    pmaddubsw     m9, [pb_m1_1]
818    mova  [rsp+3*32], m8
819    mova  [rsp+4*32], m9
820    paddw        m10, m8
821    paddw        m11, m9                        ; p6*5+p5*2+p4*2+p3*2+p2+p1+p0+q0+q1
822    pmulhrsw      m8, m10, [pw_2048]
823    pmulhrsw      m9, m11, [pw_2048]
824    packuswb      m8, m9
825    vpblendvb     m8, m7, m8, m1
826%ifidn %2, v
827    mova [tmpq+stride3q], m8                    ; p4
828%else
829    mova [rsp+14*32], m8
830%endif
831
832    ; sub p6/p5, add p2/q2 [-p6,+p2][-p5,+q2|save] C
833    ; write -4
834    mova         m14, [rsp+1*32]
835    punpcklbw     m8, m0, m13
836    punpckhbw     m9, m0, m13
837    pmaddubsw     m8, [pb_m1_1]
838    pmaddubsw     m9, [pb_m1_1]
839    paddw        m10, m8
840    paddw        m11, m9                        ; p6*4+p5*2+p4*2+p3*2+p2*2+p1+p0+q0+q1
841    punpcklbw     m8, m2, m14
842    punpckhbw     m2, m14
843    pmaddubsw     m8, [pb_m1_1]
844    pmaddubsw     m2, [pb_m1_1]
845    mova  [rsp+1*32], m8
846    paddw        m10, m8
847    paddw        m11, m2                        ; p6*4+p5+p4*2+p3*2+p2*2+p1+p0+q0+q1+q2
848    pmulhrsw      m8, m10, [pw_2048]
849    pmulhrsw      m9, m11, [pw_2048]
850    packuswb      m8, m9
851    vpblendvb     m8, m12, m8, m1
852%ifidn %2, v
853    mova [tmpq+strideq*4], m8                   ; p3
854%else
855    mova [rsp+19*32], m8
856%endif
857
858    ; sub p6/p4, add p1/q3 [-p6,+p1][-p4,+q3|save] D
859    ; write -3
860    mova         m15, [rsp+2*32]
861    punpcklbw     m8, m0, m3
862    punpckhbw     m9, m0, m3
863    pmaddubsw     m8, [pb_m1_1]
864    pmaddubsw     m9, [pb_m1_1]
865    paddw        m10, m8
866    paddw        m11, m9                        ; p6*3+p5+p4*2+p3*2+p2*2+p1*2+p0+q0+q1+q2
867    punpcklbw     m8, m7, m15
868    punpckhbw     m7, m15
869    pmaddubsw     m8, [pb_m1_1]
870    pmaddubsw     m7, [pb_m1_1]
871    mova  [rsp+2*32], m8
872    paddw        m10, m8
873    paddw        m11, m7                        ; p6*3+p5+p4+p3*2+p2*2+p1*2+p0+q0+q1+q2+q3
874    pmulhrsw      m8, m10, [pw_2048]
875    pmulhrsw      m9, m11, [pw_2048]
876    packuswb      m8, m9
877    vpblendvb     m8, m13, m8, m1
878    mova  [rsp+6*32], m8                        ; don't clobber p2/m13 since we need it in F
879
880    ; sub p6/p3, add p0/q4 [-p6,+p0][-p3,+q4|save] E
881    ; write -2
882%ifidn %2, v
883    lea         tmpq, [dstq+strideq*4]
884%endif
885    punpcklbw     m8, m0, m4
886    punpckhbw     m9, m0, m4
887    pmaddubsw     m8, [pb_m1_1]
888    pmaddubsw     m9, [pb_m1_1]
889    paddw        m10, m8
890    paddw        m11, m9                        ; p6*2+p5+p4+p3*2+p2*2+p1*2+p0*2+q0+q1+q2+q3
891%ifidn %2, v
892    mova          m9, [tmpq+strideq*0]          ; q4
893%else
894    mova          m9, [rsp+15*32]
895%endif
896    punpcklbw     m8, m12, m9
897    punpckhbw     m9, m12, m9
898    pmaddubsw     m8, [pb_m1_1]
899    pmaddubsw     m9, [pb_m1_1]
900    mova  [rsp+7*32], m8
901    mova  [rsp+5*32], m9
902    paddw        m10, m8
903    paddw        m11, m9                        ; p6*2+p5+p4+p3+p2*2+p1*2+p0*2+q0+q1+q2+q3+q4
904    pmulhrsw      m8, m10, [pw_2048]
905    pmulhrsw      m9, m11, [pw_2048]
906    packuswb      m8, m9
907    vpblendvb     m8, m3, m8, m1
908    mova  [rsp+8*32], m8                        ; don't clobber p1/m3 since we need it in G
909
910    ; sub p6/p2, add q0/q5 [-p6,+q0][-p2,+q5|save] F
911    ; write -1
912%ifidn %2, v
913    mova          m9, [tmpq+strideq*1]          ; q5
914%else
915    mova          m9, [rsp+16*32]
916%endif
917    punpcklbw     m8, m0, m5
918    punpckhbw     m0, m5
919    pmaddubsw     m8, [pb_m1_1]
920    pmaddubsw     m0, [pb_m1_1]
921    paddw        m10, m8
922    paddw        m11, m0                        ; p6+p5+p4+p3+p2*2+p1*2+p0*2+q0*2+q1+q2+q3+q4
923    punpcklbw     m0, m13, m9
924    punpckhbw     m9, m13, m9
925    mova         m13, [rsp+6*32]
926    pmaddubsw     m0, [pb_m1_1]
927    pmaddubsw     m9, [pb_m1_1]
928    mova [rsp+ 9*32], m0
929    mova [rsp+10*32], m9
930    paddw        m10, m0
931    paddw        m11, m9                        ; p6+p5+p4+p3+p2+p1*2+p0*2+q0*2+q1+q2+q3+q4+q5
932    pmulhrsw      m0, m10, [pw_2048]
933    pmulhrsw      m8, m11, [pw_2048]
934    packuswb      m0, m8
935    vpblendvb     m0, m4, m0, m1
936    mova  [rsp+6*32], m0                        ; don't clobber p0/m4 since we need it in H
937
938    ; sub p6/p1, add q1/q6 [reuse -p6,+q1 from B][-p1,+q6|save] G
939    ; write +0
940%ifidn %2, v
941    mova          m0, [tmpq+strideq*2]          ; q6
942%else
943    mova          m0, [rsp+17*32]
944%endif
945    paddw        m10, [rsp+3*32]
946    paddw        m11, [rsp+4*32]                ; p5+p4+p3+p2+p1*2+p0*2+q0*2+q1*2+q2+q3+q4+q5
947    punpcklbw     m8, m3, m0
948    punpckhbw     m9, m3, m0
949    mova          m3, [rsp+8*32]
950    pmaddubsw     m8, [pb_m1_1]
951    pmaddubsw     m9, [pb_m1_1]
952    mova  [rsp+3*32], m8
953    mova  [rsp+4*32], m9
954    paddw        m10, m8
955    paddw        m11, m9                        ; p5+p4+p3+p2+p1+p0*2+q0*2+q1*2+q2+q3+q4+q5+q6
956    pmulhrsw      m8, m10, [pw_2048]
957    pmulhrsw      m9, m11, [pw_2048]
958    packuswb      m8, m9
959    vpblendvb     m8, m5, m8, m1
960    mova  [rsp+8*32], m8                        ; don't clobber q0/m5 since we need it in I
961
962    ; sub p5/p0, add q2/q6 [reuse -p5,+q2 from C][-p0,+q6] H
963    ; write +1
964    paddw        m10, [rsp+1*32]
965    paddw        m11, m2                        ; p4+p3+p2+p1+p0*2+q0*2+q1*2+q2*2+q3+q4+q5+q6
966    punpcklbw     m8, m4, m0
967    punpckhbw     m2, m4, m0
968    mova          m4, [rsp+6*32]
969    pmaddubsw     m8, [pb_m1_1]
970    pmaddubsw     m2, [pb_m1_1]
971    paddw        m10, m8
972    paddw        m11, m2                        ; p4+p3+p2+p1+p0+q0*2+q1*2+q2*2+q3+q4+q5+q6*2
973    pmulhrsw      m2, m10, [pw_2048]
974    pmulhrsw      m9, m11, [pw_2048]
975    packuswb      m2, m9
976    vpblendvb     m2, m6, m2, m1                ; don't clobber q1/m6 since we need it in K
977
978    ; sub p4/q0, add q3/q6 [reuse -p4,+q3 from D][-q0,+q6] I
979    ; write +2
980    paddw        m10, [rsp+2*32]
981    paddw        m11, m7                        ; p3+p2+p1+p0+q0*2+q1*2+q2*2+q3*2+q4+q5+q6*2
982    punpcklbw     m8, m5, m0
983    punpckhbw     m9, m5, m0
984    mova          m5, [rsp+8*32]
985    pmaddubsw     m8, [pb_m1_1]
986    pmaddubsw     m9, [pb_m1_1]
987    paddw        m10, m8
988    paddw        m11, m9                        ; p3+p2+p1+p0+q0+q1*2+q2*2+q3*2+q4+q5+q6*3
989    pmulhrsw      m7, m10, [pw_2048]
990    pmulhrsw      m9, m11, [pw_2048]
991    packuswb      m7, m9
992    vpblendvb     m7, m14, m7, m1               ; don't clobber q2/m14 since we need it in K
993
994    ; sub p3/q1, add q4/q6 [reuse -p3,+q4 from E][-q1,+q6] J
995    ; write +3
996    paddw        m10, [rsp+7*32]
997    paddw        m11, [rsp+5*32]                ; p2+p1+p0+q0+q1*2+q2*2+q3*2+q4*2+q5+q6*3
998    punpcklbw     m8, m6, m0
999    punpckhbw     m9, m6, m0
1000    SWAP           2, 6
1001    pmaddubsw     m8, [pb_m1_1]
1002    pmaddubsw     m9, [pb_m1_1]
1003    paddw        m10, m8
1004    paddw        m11, m9                        ; p2+p1+p0+q0+q1+q2*2+q3*2+q4*2+q5+q6*4
1005    pmulhrsw      m8, m10, [pw_2048]
1006    pmulhrsw      m9, m11, [pw_2048]
1007    packuswb      m8, m9
1008    vpblendvb     m8, m15, m8, m1
1009%ifidn %2, v
1010    mova [tmpq+mstrideq], m8                    ; q3
1011%else
1012    mova [rsp+20*32], m8
1013%endif
1014
1015    ; sub p2/q2, add q5/q6 [reuse -p2,+q5 from F][-q2,+q6] K
1016    ; write +4
1017    paddw        m10, [rsp+ 9*32]
1018    paddw        m11, [rsp+10*32]               ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
1019    punpcklbw     m8, m14, m0
1020    punpckhbw     m9, m14, m0
1021    SWAP          14, 7
1022    pmaddubsw     m8, [pb_m1_1]
1023    pmaddubsw     m9, [pb_m1_1]
1024    paddw        m10, m8
1025    paddw        m11, m9                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
1026    pmulhrsw      m8, m10, [pw_2048]
1027    pmulhrsw      m9, m11, [pw_2048]
1028    packuswb      m8, m9
1029%ifidn %2, v
1030    mova          m9, [tmpq+strideq*0]
1031%else
1032    mova          m9, [rsp+15*32]
1033%endif
1034    vpblendvb     m8, m9, m8, m1
1035%ifidn %2, v
1036    mova [tmpq+strideq*0], m8                    ; q4
1037%else
1038    mova [rsp+15*32], m8
1039%endif
1040
1041    ; sub p1/q3, add q6*2 [reuse -p1,+q6 from G][-q3,+q6] L
1042    ; write +5
1043    paddw        m10, [rsp+3*32]
1044    paddw        m11, [rsp+4*32]                ; p1+p0+q0+q1+q2*2+q3*2+q4*2+q5*2+q6*4
1045    punpcklbw     m8, m15, m0
1046    punpckhbw     m9, m15, m0
1047    pmaddubsw     m8, [pb_m1_1]
1048    pmaddubsw     m9, [pb_m1_1]
1049    paddw        m10, m8
1050    paddw        m11, m9                        ; p1+p0+q0+q1+q2+q3*2+q4*2+q5*2+q6*5
1051    pmulhrsw     m10, [pw_2048]
1052    pmulhrsw     m11, [pw_2048]
1053    packuswb     m10, m11
1054%ifidn %2, v
1055    mova         m11, [tmpq+strideq*1]
1056%else
1057    mova         m11, [rsp+16*32]
1058%endif
1059    vpblendvb    m10, m11, m10, m1
1060%ifidn %2, v
1061    mova [tmpq+strideq*1], m10                  ; q5
1062%else
1063    mova [rsp+16*32], m10
1064%endif
1065
1066    mova          m9, [rsp+0*32]
1067%ifidn %2, v
1068    lea         tmpq, [dstq+mstrideq*4]
1069%endif
1070%endif
1071%if %1 >= 8
1072    ; flat8 filter
1073    punpcklbw     m0, m12, m3
1074    punpckhbw     m1, m12, m3
1075    pmaddubsw     m2, m0, [pb_3_1]
1076    pmaddubsw     m7, m1, [pb_3_1]              ; 3 * p3 + p1
1077    punpcklbw     m8, m13, m4
1078    punpckhbw    m11, m13, m4
1079    pmaddubsw     m8, [pb_2_1]
1080    pmaddubsw    m11, [pb_2_1]
1081    paddw         m2, m8
1082    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0
1083    punpcklbw     m8, m5, [pb_4]
1084    punpckhbw    m11, m5, [pb_4]
1085    pmaddubsw     m8, [pb_1]
1086    pmaddubsw    m11, [pb_1]
1087    paddw         m2, m8
1088    paddw         m7, m11                       ; 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4
1089    psrlw         m8, m2, 3
1090    psrlw        m11, m7, 3
1091    packuswb      m8, m11
1092    vpblendvb    m10, m13, m8, m9              ; p2
1093%ifidn %2, v
1094    mova [tmpq+strideq*1], m10                 ; p2
1095%endif
1096
1097    pmaddubsw     m8, m0, [pb_m1_1]
1098    pmaddubsw    m11, m1, [pb_m1_1]
1099    paddw         m2, m8
1100    paddw         m7, m11
1101    punpcklbw     m8, m13, m6
1102    punpckhbw    m11, m13, m6
1103    pmaddubsw     m8, [pb_m1_1]
1104    pmaddubsw    m11, [pb_m1_1]
1105    paddw         m2, m8
1106    paddw         m7, m11                       ; 2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4
1107    psrlw         m8, m2, 3
1108    psrlw        m11, m7, 3
1109    packuswb      m8, m11
1110    vpblendvb     m8, m3, m8, m9                ; p1
1111%ifidn %2, v
1112    mova [tmpq+strideq*2], m8                   ; p1
1113%else
1114    mova  [rsp+0*32], m8
1115%endif
1116
1117    pmaddubsw     m0, [pb_1]
1118    pmaddubsw     m1, [pb_1]
1119    psubw         m2, m0
1120    psubw         m7, m1
1121    punpcklbw     m8, m4, m14
1122    punpckhbw    m11, m4, m14
1123    pmaddubsw     m8, [pb_1]
1124    pmaddubsw    m11, [pb_1]
1125    paddw         m2, m8
1126    paddw         m7, m11                       ; p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4
1127    psrlw         m8, m2, 3
1128    psrlw        m11, m7, 3
1129    packuswb      m8, m11
1130    vpblendvb     m8, m4, m8, m9                ; p0
1131%ifidn %2, v
1132    mova [tmpq+stride3q ], m8                   ; p0
1133%else
1134    mova  [rsp+1*32], m8
1135%endif
1136
1137    punpcklbw     m0, m5, m15
1138    punpckhbw     m1, m5, m15
1139    pmaddubsw     m8, m0, [pb_1]
1140    pmaddubsw    m11, m1, [pb_1]
1141    paddw         m2, m8
1142    paddw         m7, m11
1143    punpcklbw     m8, m4, m12
1144    punpckhbw    m11, m4, m12
1145    pmaddubsw     m8, [pb_1]
1146    pmaddubsw    m11, [pb_1]
1147    psubw         m2, m8
1148    psubw         m7, m11                       ; p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4
1149    psrlw         m8, m2, 3
1150    psrlw        m11, m7, 3
1151    packuswb      m8, m11
1152    vpblendvb    m11, m5, m8, m9                ; q0
1153%ifidn %2, v
1154    mova [dstq+strideq*0], m11                  ; q0
1155%endif
1156
1157    pmaddubsw     m0, [pb_m1_1]
1158    pmaddubsw     m1, [pb_m1_1]
1159    paddw         m2, m0
1160    paddw         m7, m1
1161    punpcklbw     m8, m13, m6
1162    punpckhbw    m13, m6
1163    pmaddubsw     m8, [pb_m1_1]
1164    pmaddubsw    m13, [pb_m1_1]
1165    paddw         m2, m8
1166    paddw         m7, m13                       ; p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4
1167    psrlw         m8, m2, 3
1168    psrlw        m13, m7, 3
1169    packuswb      m8, m13
1170    vpblendvb    m13, m6, m8, m9                ; q1
1171%ifidn %2, v
1172    mova [dstq+strideq*1], m13                  ; q1
1173%endif
1174
1175    punpcklbw     m0, m3, m6
1176    punpckhbw     m1, m3, m6
1177    pmaddubsw     m0, [pb_1]
1178    pmaddubsw     m1, [pb_1]
1179    psubw         m2, m0
1180    psubw         m7, m1
1181    punpcklbw     m0, m14, m15
1182    punpckhbw     m1, m14, m15
1183    pmaddubsw     m0, [pb_1]
1184    pmaddubsw     m1, [pb_1]
1185    paddw         m2, m0
1186    paddw         m7, m1                        ; p0 + q0 + q1 + q2 + 2 * q2 + 3 * q3 + 4
1187    psrlw         m2, 3
1188    psrlw         m7, 3
1189    packuswb      m2, m7
1190    vpblendvb     m2, m14, m2, m9               ; q2
1191%ifidn %2, v
1192    mova [dstq+strideq*2], m2                   ; q2
1193%else
1194    mova          m0, [rsp+0*32]
1195    mova          m1, [rsp+1*32]
1196%if %1 == 8
1197    ; 16x8 transpose
1198    punpcklbw     m3, m12, m10
1199    punpckhbw    m12, m10
1200    punpcklbw    m10, m0, m1
1201    punpckhbw     m0, m1
1202    punpcklbw     m1, m11, m13
1203    punpckhbw    m11, m13
1204    punpcklbw    m13, m2, m15
1205    punpckhbw     m2, m15
1206
1207    punpcklwd    m15, m3, m10
1208    punpckhwd     m3, m10
1209    punpcklwd    m10, m12, m0
1210    punpckhwd    m12, m0
1211    punpcklwd     m0, m1, m13
1212    punpckhwd     m1, m13
1213    punpcklwd    m13, m11, m2
1214    punpckhwd    m11, m2
1215
1216    punpckldq     m2, m15, m0
1217    punpckhdq    m15, m0
1218    punpckldq     m0, m3, m1
1219    punpckhdq     m3, m1
1220    punpckldq     m1, m10, m13
1221    punpckhdq    m10, m13
1222    punpckldq    m13, m12, m11
1223    punpckhdq    m12, m11
1224
1225    ; write 8x32
1226    movq   [dstq+strideq*0-4], xm2
1227    movhps [dstq+strideq*1-4], xm2
1228    movq   [dstq+strideq*2-4], xm15
1229    movhps [dstq+stride3q -4], xm15
1230    lea         dstq, [dstq+strideq*4]
1231    movq   [dstq+strideq*0-4], xm0
1232    movhps [dstq+strideq*1-4], xm0
1233    movq   [dstq+strideq*2-4], xm3
1234    movhps [dstq+stride3q -4], xm3
1235    lea         dstq, [dstq+strideq*4]
1236    movq   [dstq+strideq*0-4], xm1
1237    movhps [dstq+strideq*1-4], xm1
1238    movq   [dstq+strideq*2-4], xm10
1239    movhps [dstq+stride3q -4], xm10
1240    lea         dstq, [dstq+strideq*4]
1241    movq   [dstq+strideq*0-4], xm13
1242    movhps [dstq+strideq*1-4], xm13
1243    movq   [dstq+strideq*2-4], xm12
1244    movhps [dstq+stride3q -4], xm12
1245    lea         dstq, [dstq+strideq*4]
1246
1247    vextracti128  xm2,  m2, 1
1248    vextracti128 xm15, m15, 1
1249    vextracti128  xm0,  m0, 1
1250    vextracti128  xm3,  m3, 1
1251    vextracti128  xm1,  m1, 1
1252    vextracti128 xm10, m10, 1
1253    vextracti128 xm13, m13, 1
1254    vextracti128 xm12, m12, 1
1255
1256    movq   [dstq+strideq*0-4], xm2
1257    movhps [dstq+strideq*1-4], xm2
1258    movq   [dstq+strideq*2-4], xm15
1259    movhps [dstq+stride3q -4], xm15
1260    lea         dstq, [dstq+strideq*4]
1261    movq   [dstq+strideq*0-4], xm0
1262    movhps [dstq+strideq*1-4], xm0
1263    movq   [dstq+strideq*2-4], xm3
1264    movhps [dstq+stride3q -4], xm3
1265    lea         dstq, [dstq+strideq*4]
1266    movq   [dstq+strideq*0-4], xm1
1267    movhps [dstq+strideq*1-4], xm1
1268    movq   [dstq+strideq*2-4], xm10
1269    movhps [dstq+stride3q -4], xm10
1270    lea         dstq, [dstq+strideq*4]
1271    movq   [dstq+strideq*0-4], xm13
1272    movhps [dstq+strideq*1-4], xm13
1273    movq   [dstq+strideq*2-4], xm12
1274    movhps [dstq+stride3q -4], xm12
1275    lea         dstq, [dstq+strideq*4]
1276%else
1277    ; 16x16 transpose and store
1278    SWAP           5, 10, 2
1279    SWAP           6, 0
1280    SWAP           7, 1
1281    SWAP           8, 11
1282    SWAP           9, 13
1283    mova          m0, [rsp+11*32]
1284    mova          m1, [rsp+12*32]
1285    mova          m2, [rsp+13*32]
1286    mova          m3, [rsp+14*32]
1287    mova          m4, [rsp+19*32]
1288    mova         m11, [rsp+20*32]
1289    mova         m12, [rsp+15*32]
1290    mova         m13, [rsp+16*32]
1291    mova         m14, [rsp+17*32]
1292    TRANSPOSE_16X16B 1, 0, [rsp+18*32]
1293    movu [dstq+strideq*0-8], xm0
1294    movu [dstq+strideq*1-8], xm1
1295    movu [dstq+strideq*2-8], xm2
1296    movu [dstq+stride3q -8], xm3
1297    lea         dstq, [dstq+strideq*4]
1298    movu [dstq+strideq*0-8], xm4
1299    movu [dstq+strideq*1-8], xm5
1300    movu [dstq+strideq*2-8], xm6
1301    movu [dstq+stride3q -8], xm7
1302    lea         dstq, [dstq+strideq*4]
1303    movu [dstq+strideq*0-8], xm8
1304    movu [dstq+strideq*1-8], xm9
1305    movu [dstq+strideq*2-8], xm10
1306    movu [dstq+stride3q -8], xm11
1307    lea         dstq, [dstq+strideq*4]
1308    movu [dstq+strideq*0-8], xm12
1309    movu [dstq+strideq*1-8], xm13
1310    movu [dstq+strideq*2-8], xm14
1311    movu [dstq+stride3q -8], xm15
1312    lea         dstq, [dstq+strideq*4]
1313    vextracti128 [dstq+strideq*0-8], m0, 1
1314    vextracti128 [dstq+strideq*1-8], m1, 1
1315    vextracti128 [dstq+strideq*2-8], m2, 1
1316    vextracti128 [dstq+stride3q -8], m3, 1
1317    lea         dstq, [dstq+strideq*4]
1318    vextracti128 [dstq+strideq*0-8], m4, 1
1319    vextracti128 [dstq+strideq*1-8], m5, 1
1320    vextracti128 [dstq+strideq*2-8], m6, 1
1321    vextracti128 [dstq+stride3q -8], m7, 1
1322    lea         dstq, [dstq+strideq*4]
1323    vextracti128 [dstq+strideq*0-8], m8, 1
1324    vextracti128 [dstq+strideq*1-8], m9, 1
1325    vextracti128 [dstq+strideq*2-8], m10, 1
1326    vextracti128 [dstq+stride3q -8], m11, 1
1327    lea         dstq, [dstq+strideq*4]
1328    vextracti128 [dstq+strideq*0-8], m12, 1
1329    vextracti128 [dstq+strideq*1-8], m13, 1
1330    vextracti128 [dstq+strideq*2-8], m14, 1
1331    vextracti128 [dstq+stride3q -8], m15, 1
1332    lea         dstq, [dstq+strideq*4]
1333%endif
1334%endif
1335%elif %1 == 6
1336    ; flat6 filter
1337
1338    punpcklbw     m8, m13, m5
1339    punpckhbw    m11, m13, m5
1340    pmaddubsw     m0, m8, [pb_3_1]
1341    pmaddubsw     m1, m11, [pb_3_1]
1342    punpcklbw     m7, m4, m3
1343    punpckhbw    m10, m4, m3
1344    pmaddubsw     m2, m7, [pb_2]
1345    pmaddubsw    m12, m10, [pb_2]
1346    paddw         m0, m2
1347    paddw         m1, m12
1348    pmulhrsw      m2, m0, [pw_4096]
1349    pmulhrsw     m12, m1, [pw_4096]
1350    packuswb      m2, m12
1351    vpblendvb     m2, m3, m2, m9
1352%ifidn %2, v
1353    mova [tmpq+strideq*2], m2                   ; p1
1354%endif
1355
1356    pmaddubsw     m8, [pb_m1_1]
1357    pmaddubsw    m11, [pb_m1_1]
1358    paddw         m0, m8
1359    paddw         m1, m11
1360    punpcklbw     m8, m13, m6
1361    punpckhbw    m11, m13, m6
1362    pmaddubsw     m8, [pb_m1_1]
1363    pmaddubsw    m11, [pb_m1_1]
1364    paddw         m0, m8
1365    paddw         m1, m11
1366    pmulhrsw     m12, m0, [pw_4096]
1367    pmulhrsw     m13, m1, [pw_4096]
1368    packuswb     m12, m13
1369    vpblendvb    m12, m4, m12, m9
1370%ifidn %2, v
1371    mova [tmpq+stride3q], m12                   ; p0
1372%endif
1373
1374    paddw         m0, m8
1375    paddw         m1, m11
1376    punpcklbw     m8, m3, m14
1377    punpckhbw    m11, m3, m14
1378    pmaddubsw    m14, m8, [pb_m1_1]
1379    pmaddubsw    m13, m11, [pb_m1_1]
1380    paddw         m0, m14
1381    paddw         m1, m13
1382    pmulhrsw     m14, m0, [pw_4096]
1383    pmulhrsw     m13, m1, [pw_4096]
1384    packuswb     m14, m13
1385    vpblendvb    m14, m5, m14, m9
1386%ifidn %2, v
1387    mova [dstq+strideq*0], m14                  ; q0
1388%endif
1389
1390    pmaddubsw     m8, [pb_m1_2]
1391    pmaddubsw    m11, [pb_m1_2]
1392    paddw         m0, m8
1393    paddw         m1, m11
1394    pmaddubsw     m7, [pb_m1_0]
1395    pmaddubsw    m10, [pb_m1_0]
1396    paddw         m0, m7
1397    paddw         m1, m10
1398    pmulhrsw      m0, [pw_4096]
1399    pmulhrsw      m1, [pw_4096]
1400    packuswb      m0, m1
1401    vpblendvb     m0, m6, m0, m9
1402%ifidn %2, v
1403    mova [dstq+strideq*1], m0                   ; q1
1404%else
1405    TRANSPOSE_16x4_AND_WRITE_4x32 2, 12, 14, 0, 1
1406%endif
1407%else
1408%ifidn %2, v
1409    mova [tmpq+strideq*0], m3                   ; p1
1410    mova [tmpq+strideq*1], m4                   ; p0
1411    mova [tmpq+strideq*2], m5                   ; q0
1412    mova [tmpq+stride3q ], m6                   ; q1
1413%else
1414    TRANSPOSE_16x4_AND_WRITE_4x32 3, 4, 5, 6, 7
1415%endif
1416%endif
1417%endmacro
1418
INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_v_sb_y_8bpc(dst, stride, mask, l, l_stride, lut, w)
; Luma vertical loop filter (filters across horizontal edges) for one
; superblock row, 8 bpc, AVX2.
; Walks the row left to right: each iteration advances dst by 32 bytes,
; mask by 1 byte and consumes 8 units of w. Per iteration it picks the
; widest filter whose mask byte is set:
;   mask[8] (vmask[2]) -> FILTER 16 (flat16 path)
;   mask[4] (vmask[1]) -> FILTER 8  (flat8 path)
;   mask[0] (vmask[0]) -> FILTER 4  (via the .v4 helper below)
; .v4 is also tail-called from lpf_v_sb_uv_8bpc via mangle(), so its
; label name, register contract and stack usage must not change.
; Stack: 11 ymm spill slots (32*11) used by the FILTER macro.
;-----------------------------------------------------------------------
cglobal lpf_v_sb_y_8bpc, 7, 10, 16, 32 * 11, \
                    dst, stride, mask, l, l_stride, lut, \
                    w, stride3, mstride, tmp
    shl    l_strideq, 2                         ; l_stride *= 4 (presumably l[] entries are 4 bytes each — matches the fixed -4/-32 l offsets elsewhere)
    sub           lq, l_strideq                 ; step l back one row, so lq points at the l entries above the edge
    mov     mstrideq, strideq
    neg     mstrideq                            ; mstride = -stride, for addressing rows above dst
    lea     stride3q, [strideq*3]

.loop:
    cmp byte [maskq+8], 0                       ; vmask[2]: widest (flat16) filter requested?
    je .no_flat16

    FILTER        16, v
    jmp .end

.no_flat16:
    cmp byte [maskq+4], 0                       ; vmask[1]: flat8 filter requested?
    je .no_flat

    FILTER         8, v
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]: narrow (4-tap) filter requested?
    je .end                                     ; no filtering for this 32-pixel span

    call .v4                                    ; shared wd=4 path (also used by the uv version)

.end:
    add           lq, 32                        ; next 32 bytes of l data
    add         dstq, 32                        ; next 32 pixels
    add        maskq, 1
    sub           wd, 8                         ; w is counted in units of 8 per 32 pixels
    jg .loop
    RET
ALIGN function_align
; wd=4 vertical filter helper; `call`ed (not inlined) so chroma can reuse it.
.v4:
    FILTER         4, v
    ret
1460
INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_h_sb_y_8bpc(dst, stride, mask, l, l_stride, lut, h)
; Luma horizontal loop filter (filters across vertical edges) for one
; superblock column, 8 bpc, AVX2. Pixels for each edge lie across rows,
; so the FILTER macro's h variant transposes, filters, and transposes
; back (hence the larger 32*21 spill area vs the v version).
; Walks downward: each iteration consumes 8 units of h and one mask
; byte, picking the widest filter whose mask byte is set:
;   mask[8] (vmask[2]) -> FILTER 16
;   mask[4] (vmask[1]) -> FILTER 8
;   mask[0] (vmask[0]) -> FILTER 4  (via .h4)
; When filtering runs, dst/l advancement happens inside FILTER; the
; explicit .no_filter adjustment below mirrors it for the skip case.
; .h4 is tail-called from lpf_h_sb_uv_8bpc via mangle(); keep its label
; and contract stable.
;-----------------------------------------------------------------------
cglobal lpf_h_sb_y_8bpc, 7, 10, 16, 32 * 21, \
                    dst, stride, mask, l, l_stride, lut, \
                    h, stride3, l_stride3, tmp
    shl    l_strideq, 2                         ; l_stride *= 4 (presumably 4 bytes per l[] entry)
    sub           lq, 4                         ; step l back one entry (the column left of the edge)
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]

.loop:
    cmp byte [maskq+8], 0                       ; vmask[2]: widest (flat16) filter requested?
    je .no_flat16

    FILTER        16, h
    jmp .end

.no_flat16:
    cmp byte [maskq+4], 0                       ; vmask[1]: flat8 filter requested?
    je .no_flat

    FILTER         8, h
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]: narrow (4-tap) filter requested?
    je .no_filter

    call .h4                                    ; shared wd=4 path (also used by the uv version)
    jmp .end

.no_filter:
    ; nothing filtered: advance dst/l manually by the 32 rows the
    ; FILTER macro would otherwise have consumed (3*8 + 8 = 32 strides)
    lea         dstq, [dstq+stride3q*8]
    lea           lq, [lq+l_strideq*8]
    lea         dstq, [dstq+strideq*8]
.end:
    add        maskq, 1
    sub           hd, 8                         ; h is counted in units of 8 per 32 rows
    jg .loop
    RET
ALIGN function_align
; wd=4 horizontal filter helper; `call`ed (not inlined) so chroma can reuse it.
.h4:
    FILTER         4, h
    ret
1504
INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_v_sb_uv_8bpc(dst, stride, mask, l, l_stride, lut, w)
; Chroma vertical loop filter, 8 bpc, AVX2. Same walk as the luma
; version, but chroma has no flat16 path, so only two mask levels are
; tested:
;   mask[4] (vmask[1]) -> FILTER 6 (flat6 path)
;   mask[0] (vmask[0]) -> wd=4, reused from the luma function's .v4
;                         helper via mangle() (identical register setup)
; No explicit stack-size argument: the wd<=6 FILTER paths need no ymm
; spill slots, unlike the luma variants.
;-----------------------------------------------------------------------
cglobal lpf_v_sb_uv_8bpc, 7, 10, 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     w, stride3, mstride, tmp
    shl    l_strideq, 2                         ; l_stride *= 4 (presumably 4 bytes per l[] entry)
    sub           lq, l_strideq                 ; step l back one row (entries above the edge)
    mov     mstrideq, strideq
    neg     mstrideq                            ; mstride = -stride, for rows above dst
    lea     stride3q, [strideq*3]

.loop:
    cmp byte [maskq+4], 0                       ; vmask[1]: flat6 filter requested?
    je .no_flat

    FILTER         6, v
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]: narrow (4-tap) filter requested?
    je .end                                     ; no filtering for this 32-pixel span

    ; reuse the luma wd=4 vertical helper — register state matches its contract
    call mangle(private_prefix %+ _lpf_v_sb_y_8bpc_avx2).v4

.end:
    add           lq, 32                        ; next 32 bytes of l data
    add         dstq, 32                        ; next 32 pixels
    add        maskq, 1
    sub           wd, 8                         ; w counted in units of 8 per 32 pixels
    jg .loop
    RET
1535
INIT_YMM avx2
;-----------------------------------------------------------------------
; lpf_h_sb_uv_8bpc(dst, stride, mask, l, l_stride, lut, h)
; Chroma horizontal loop filter, 8 bpc, AVX2. Same walk as the luma
; version, but chroma has no flat16 path, so only two mask levels are
; tested:
;   mask[4] (vmask[1]) -> FILTER 6 (flat6 path)
;   mask[0] (vmask[0]) -> wd=4, reused from the luma function's .h4
;                         helper via mangle() (identical register setup)
; Advancement of dst/l happens inside FILTER when filtering runs; the
; .no_filter block mirrors it (32 strides total) for the skip case.
;-----------------------------------------------------------------------
cglobal lpf_h_sb_uv_8bpc, 7, 10, 16, \
                     dst, stride, mask, l, l_stride, lut, \
                     h, stride3, l_stride3, tmp
    shl    l_strideq, 2                         ; l_stride *= 4 (presumably 4 bytes per l[] entry)
    sub           lq, 4                         ; step l back one entry (the column left of the edge)
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]

.loop:
    cmp byte [maskq+4], 0                       ; vmask[1]: flat6 filter requested?
    je .no_flat

    FILTER         6, h
    jmp .end

.no_flat:
    cmp byte [maskq+0], 0                       ; vmask[0]: narrow (4-tap) filter requested?
    je .no_filter

    ; reuse the luma wd=4 horizontal helper — register state matches its contract
    call mangle(private_prefix %+ _lpf_h_sb_y_8bpc_avx2).h4
    jmp .end

.no_filter:
    ; nothing filtered: advance dst/l manually by the 32 rows the
    ; FILTER macro would otherwise have consumed (3*8 + 8 = 32 strides)
    lea         dstq, [dstq+stride3q*8]
    lea           lq, [lq+l_strideq*8]
    lea         dstq, [dstq+strideq*8]
.end:
    add        maskq, 1
    sub           hd, 8                         ; h counted in units of 8 per 32 rows
    jg .loop
    RET
1568
1569%endif ; ARCH_X86_64
1570