; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

%include "config.asm"
%include "ext/x86/x86inc.asm"

%if ARCH_X86_64

SECTION_RODATA 32

pb_mask: dd 1, 1, 2, 2, 4, 4, 8, 8
pb_4x1_4x5_4x9_4x13: times 4 db 0, 1
                     times 4 db 8, 9
                     times 4 db 0, 1
                     times 4 db 8, 9

pw_1:     times 16 dw 1
pw_2:     times 16 dw 2
pw_3:     times 16 dw 3
pw_4096:  times 2 dw 4096

; 10bpc/12bpc:
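; Each of the constants below stores its 10bpc value in the low dword and its
; 12bpc value in the high dword; the entry points set r11 = pw_4 + is_12bpc*4,
; so [r11], [r11+8*1] and [r11+8*2] broadcast the bitdepth-appropriate
; threshold scale, clip_max and clip_min respectively.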
pw_4:     times 2 dw 4
          times 2 dw 16
clip_max: times 2 dw 511
          times 2 dw 2047
clip_min: times 2 dw -512
          times 2 dw -2048

SECTION .text

;        in:            out:
; mm%1   a b c d        a e i m
; mm%2   e f g h        b f j n
; mm%3   i j k l   ->   c g k o
; mm%4   m n o p        d h l p
%macro TRANSPOSE4X4W 5
    punpcklwd        m%5, m%1, m%2
    punpckhwd        m%1, m%2
    punpcklwd        m%2, m%3, m%4
    punpckhwd        m%3, m%4
    punpckldq        m%4, m%5, m%2
    punpckhdq        m%5, m%2
    punpckldq        m%2, m%1, m%3
    punpckhdq        m%1, m%3

    SWAP              %1, %4
    SWAP              %2, %5, %3
%endmacro

;         in:                  out:
; xmm%1   a b c d e f g h      a i q y 6 E M U
; xmm%2   i j k l m n o p      b j r z 7 F N V
; xmm%3   q r s t u v w x      c k s 0 8 G O W
; xmm%4   y z 0 1 2 3 4 5      d l t 1 9 H P X
; xmm%5   6 7 8 9 A B C D  ->  e m u 2 A I Q Y
; xmm%6   E F G H I J K L      f n v 3 B J R Z
; xmm%7   M N O P Q R S T      g o w 4 C K S +
; xmm%8   U V W X Y Z + =      h p x 5 D L T =
%macro TRANSPOSE8X8W 9
    ; xmm%1   a b c d e f g h      a i q y b j r z
    ; xmm%2   i j k l m n o p      c k s 0 d l t 1
    ; xmm%3   q r s t u v w x  ->  e m u 2 f n v 3
    ; xmm%4   y z 0 1 2 3 4 5      g o w 4 h p x 5
    TRANSPOSE4X4W     %1, %2, %3, %4, %9

    ; xmm%5   6 7 8 9 A B C D      6 E M U 7 F N V
    ; xmm%6   E F G H I J K L      8 G O W 9 H P X
    ; xmm%7   M N O P Q R S T  ->  A I Q Y B J R Z
    ; xmm%8   U V W X Y Z + =      C K S + D L T =
    TRANSPOSE4X4W     %5, %6, %7, %8, %9

    ; xmm%1   a i q y b j r z      a i q y 6 E M U
    ; xmm%2   c k s 0 d l t 1      b j r z 7 F N V
    ; xmm%3   e m u 2 f n v 3      c k s 0 8 G O W
    ; xmm%4   g o w 4 h p x 5      d l t 1 9 H P X
    ; xmm%5   6 E M U 7 F N V  ->  e m u 2 A I Q Y
    ; xmm%6   8 G O W 9 H P X      f n v 3 B J R Z
    ; xmm%7   A I Q Y B J R Z      g o w 4 C K S +
    ; xmm%8   C K S + D L T =      h p x 5 D L T =
    punpckhqdq       m%9, m%1, m%5
    punpcklqdq       m%1, m%5
    punpckhqdq       m%5, m%2, m%6
    punpcklqdq       m%2, m%6
    punpckhqdq       m%6, m%3, m%7
    punpcklqdq       m%3, m%7
    punpckhqdq       m%7, m%4, m%8
    punpcklqdq       m%4, m%8

    SWAP %8, %7, %4, %5, %3, %2, %9
%endmacro

; transpose and write m3-6, everything else is scratch
%macro TRANSPOSE_8x4_AND_WRITE_4x16 0
    ; transpose 8x4
    punpcklwd     m0, m3, m4
    punpckhwd     m3, m4
    punpcklwd     m4, m5, m6
    punpckhwd     m5, m6
    punpckldq     m6, m0, m4
    punpckhdq     m0, m4
    punpckldq     m4, m3, m5
    punpckhdq     m3, m5

    ; write out
    movq   [dstq+strideq*0-4], xm6
    movhps [dstq+strideq*1-4], xm6
    movq   [dstq+strideq*2-4], xm0
    movhps [dstq+stride3q -4], xm0
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm4
    movhps [dstq+strideq*1-4], xm4
    movq   [dstq+strideq*2-4], xm3
    movhps [dstq+stride3q -4], xm3
    lea         dstq, [dstq+strideq*4]

    vextracti128 xm6, m6, 1
    vextracti128 xm0, m0, 1
    vextracti128 xm4, m4, 1
    vextracti128 xm3, m3, 1

    movq   [dstq+strideq*0-4], xm6
    movhps [dstq+strideq*1-4], xm6
    movq   [dstq+strideq*2-4], xm0
    movhps [dstq+stride3q -4], xm0
    lea         dstq, [dstq+strideq*4]
    movq   [dstq+strideq*0-4], xm4
    movhps [dstq+strideq*1-4], xm4
    movq   [dstq+strideq*2-4], xm3
    movhps [dstq+stride3q -4], xm3
    lea         dstq, [dstq+strideq*4]
%endmacro

%macro FILTER 2 ; width [4/6/8/16], dir [h/v]
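    ; Rough flow of this macro: load the p/q rows (directly for dir=v, via an
    ; in-register transpose for dir=h), derive the E/I/H thresholds and the
    ; fm/hev/flat masks from the L values and vmask bits, apply the short
    ; filter and (for wd >= 6) the flat filter(s), then write back
    ; (transposing again for dir=h).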
    ; load data
%ifidn %2, v
%if %1 == 4
    lea         tmpq, [dstq+mstrideq*2]
    mova          m3, [tmpq+strideq*0]          ; p1
    mova          m4, [tmpq+strideq*1]          ; p0
    mova          m5, [tmpq+strideq*2]          ; q0
    mova          m6, [tmpq+stride3q]           ; q1
%else
    ; load 6-8 pixels, remainder (for wd=16) will be read inline
    lea         tmpq, [dstq+mstrideq*4]
    ; we load p3 later
    mova         m13, [tmpq+strideq*1]
    mova          m3, [tmpq+strideq*2]
    mova          m4, [tmpq+stride3q]
    mova          m5, [dstq+strideq*0]
    mova          m6, [dstq+strideq*1]
    mova         m14, [dstq+strideq*2]
%if %1 != 6
    mova         m15, [dstq+stride3q]
%endif
%endif
%else
    ; load lines
%if %1 == 4
    movq         xm3, [dstq+strideq*0-4]
    movq         xm4, [dstq+strideq*1-4]
    movq         xm5, [dstq+strideq*2-4]
    movq         xm6, [dstq+stride3q -4]
    lea         tmpq, [dstq+strideq*4]
    movq        xm11, [tmpq+strideq*0-4]
    movq        xm13, [tmpq+strideq*1-4]
    movq        xm14, [tmpq+strideq*2-4]
    movq        xm15, [tmpq+stride3q -4]
    lea         tmpq, [tmpq+strideq*4]
    ; this overreads by 8 bytes but the buffers are padded
    ; so that should be ok
    vinserti128   m3, [tmpq+strideq*0-4], 1
    vinserti128   m4, [tmpq+strideq*1-4], 1
    vinserti128   m5, [tmpq+strideq*2-4], 1
    vinserti128   m6, [tmpq+stride3q -4], 1
    lea         tmpq, [tmpq+strideq*4]
    vinserti128  m11, [tmpq+strideq*0-4], 1
    vinserti128  m13, [tmpq+strideq*1-4], 1
    vinserti128  m14, [tmpq+strideq*2-4], 1
    vinserti128  m15, [tmpq+stride3q -4], 1

    ; transpose 4x8
    ; xm3: A-D0,A-D4
    ; xm4: A-D1,A-D5
    ; xm5: A-D2,A-D6
    ; xm6: A-D3,A-D7
    punpcklwd     m7, m3, m4
    punpcklwd     m3, m11, m13
    punpcklwd     m4, m5, m6
    punpcklwd     m5, m14, m15
    ; xm7: A0-1,B0-1,C0-1,D0-1
    ; xm3: A4-5,B4-5,C4-5,D4-5
    ; xm4: A2-3,B2-3,C2-3,D2-3
    ; xm5: A6-7,B6-7,C6-7,D6-7
    punpckldq     m6, m7, m4
    punpckhdq     m7, m4
    punpckldq     m8, m3, m5
    punpckhdq     m5, m3, m5
    ; xm6: A0-3,B0-3
    ; xm7: C0-3,D0-3
    ; xm8: A4-7,B4-7
    ; xm5: C4-7,D4-7
    punpcklqdq    m3, m6, m8
    punpckhqdq    m4, m6, m8
    punpckhqdq    m6, m7, m5
    punpcklqdq    m5, m7, m5
    ; xm3: A0-7
    ; xm4: B0-7
    ; xm5: C0-7
    ; xm6: D0-7
%elif %1 == 6 || %1 == 8
    movu         xm3, [dstq+strideq*0-8]
    movu         xm4, [dstq+strideq*1-8]
    movu         xm5, [dstq+strideq*2-8]
    movu         xm6, [dstq+stride3q -8]
    lea         tmpq, [dstq+strideq*4]
    movu        xm11, [tmpq+strideq*0-8]
    movu        xm13, [tmpq+strideq*1-8]
    movu        xm14, [tmpq+strideq*2-8]
    movu        xm15, [tmpq+stride3q -8]
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m3, [tmpq+strideq*0-8], 1
    vinserti128   m4, [tmpq+strideq*1-8], 1
    vinserti128   m5, [tmpq+strideq*2-8], 1
    vinserti128   m6, [tmpq+stride3q -8], 1
    lea         tmpq, [tmpq+strideq*4]
    vinserti128  m11, [tmpq+strideq*0-8], 1
    vinserti128  m13, [tmpq+strideq*1-8], 1
    vinserti128  m14, [tmpq+strideq*2-8], 1
    vinserti128  m15, [tmpq+stride3q -8], 1

    ; transpose 8x16
    ; xm3: A-H0,A-H8
    ; xm4: A-H1,A-H9
    ; xm5: A-H2,A-H10
    ; xm6: A-H3,A-H11
    ; xm11: A-H4,A-H12
    ; xm13: A-H5,A-H13
    ; xm14: A-H6,A-H14
    ; xm15: A-H7,A-H15
    punpcklwd    m7, m3, m4
    punpckhwd    m3, m4
    punpcklwd    m4, m5, m6
    punpckhwd    m5, m6
    punpcklwd    m6, m11, m13
    punpckhwd   m11, m13
    punpcklwd   m13, m14, m15
    punpckhwd   m14, m15
    ; xm7: A0-1,B0-1,C0-1,D0-1
    ; xm3: E0-1,F0-1,G0-1,H0-1
    ; xm4: A2-3,B2-3,C2-3,D2-3
    ; xm5: E2-3,F2-3,G2-3,H2-3
    ; xm6: A4-5,B4-5,C4-5,D4-5
    ; xm11: E4-5,F4-5,G4-5,H4-5
    ; xm13: A6-7,B6-7,C6-7,D6-7
    ; xm14: E6-7,F6-7,G6-7,H6-7
    punpckldq   m15, m7, m4
    punpckhdq    m7, m4
    punpckldq    m9, m3, m5
    punpckhdq    m8, m3, m5
    punpckldq    m3, m6, m13
    punpckhdq    m6, m13
    punpckldq   m10, m11, m14
    punpckhdq   m11, m14
    ; xm15: A0-3,B0-3
    ; xm7: C0-3,D0-3
    ; xm9: E0-3,F0-3
    ; xm8: G0-3,H0-3
    ; xm3: A4-7,B4-7
    ; xm6: C4-7,D4-7
    ; xm10: E4-7,F4-7
    ; xm11: G4-7,H4-7
%if %1 != 6
    punpcklqdq   m0, m15, m3
%endif
    punpckhqdq  m13, m15, m3
    punpcklqdq   m3, m7, m6
    punpckhqdq   m4, m7, m6
    punpcklqdq   m5, m9, m10
    punpckhqdq   m6, m9, m10
    punpcklqdq  m14, m8, m11
%if %1 != 6
    punpckhqdq  m15, m8, m11
    mova [rsp+5*32], m0
%endif
%else
    ; We only use 14 pixels but we'll need the remainder at the end for
    ; the second transpose
    mova         xm0, [dstq+strideq*0-16]
    mova         xm1, [dstq+strideq*1-16]
    mova         xm2, [dstq+strideq*2-16]
    mova         xm3, [dstq+stride3q -16]
    lea         tmpq, [dstq+strideq*4]
    mova         xm4, [tmpq+strideq*0-16]
    mova         xm5, [tmpq+strideq*1-16]
    mova         xm6, [tmpq+strideq*2-16]
    mova         xm7, [tmpq+stride3q -16]
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m0, m0, [tmpq+strideq*0-16], 1
    vinserti128   m1, m1, [tmpq+strideq*1-16], 1
    vinserti128   m2, m2, [tmpq+strideq*2-16], 1
    vinserti128   m3, m3, [tmpq+stride3q -16], 1
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m4, m4, [tmpq+strideq*0-16], 1
    vinserti128   m5, m5, [tmpq+strideq*1-16], 1
    vinserti128   m6, m6, [tmpq+strideq*2-16], 1
    vinserti128   m7, m7, [tmpq+stride3q -16], 1

    TRANSPOSE8X8W 0, 1, 2, 3, 4, 5, 6, 7, 8

    mova    [rsp+6*32], m0
    mova    [rsp+7*32], m1
    mova    [rsp+8*32], m2
    mova    [rsp+9*32], m3
    mova    [rsp+5*32], m4

    mova         xm0, [dstq+strideq*0]
    mova         xm1, [dstq+strideq*1]
    mova         xm2, [dstq+strideq*2]
    mova         xm3, [dstq+stride3q ]
    lea         tmpq, [dstq+strideq*4]
    mova         xm8, [tmpq+strideq*0]
    mova         xm9, [tmpq+strideq*1]
    mova        xm10, [tmpq+strideq*2]
    mova        xm11, [tmpq+stride3q ]
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m0, m0, [tmpq+strideq*0], 1
    vinserti128   m1, m1, [tmpq+strideq*1], 1
    vinserti128   m2, m2, [tmpq+strideq*2], 1
    vinserti128   m3, m3, [tmpq+stride3q ], 1
    lea         tmpq, [tmpq+strideq*4]
    vinserti128   m8, m8, [tmpq+strideq*0], 1
    vinserti128   m9, m9, [tmpq+strideq*1], 1
    vinserti128  m10, m10, [tmpq+strideq*2], 1
    vinserti128  m11, m11, [tmpq+stride3q ], 1

    TRANSPOSE8X8W 0, 1, 2, 3, 8, 9, 10, 11, 4

    mova   [rsp+10*32], m8
    mova   [rsp+11*32], m9
    mova   [rsp+12*32], m10
    mova   [rsp+13*32], m11

    ; 5,6,7,0,1,2,3 -> 13,3,4,5,6,14,15
    SWAP         13, 5, 0
    SWAP          3, 6, 1, 15
    SWAP          4, 7
    SWAP          2, 14
%endif
%endif

    ; load L/E/I/H
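    ; L is the per-edge filter level (falling back to the previous edge's
    ; level where it is 0). From it the code below derives:
    ;   I = max(min(L >> sharp[0], sharp[1]), 1),  H = L >> 4,  E = 2*(L+2) + I
    ; each scaled by 4 (10bpc) or 16 (12bpc) via [r11] to match the pixel range.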
%ifidn %2, v
    pmovzxbw      m1, [lq]
    pmovzxbw      m0, [lq+l_strideq]
    pxor          m2, m2
%else
    vpbroadcastq  m0, [lq]                      ; l0, l1
    vpbroadcastq  m1, [lq+l_strideq]            ; l2, l3
    vpbroadcastq  m2, [lq+l_strideq*2]          ; l4, l5
    vpbroadcastq m10, [lq+l_stride3q]           ; l6, l7
    punpckldq     m0, m1                        ; l0, l2, l1, l3 [2x]
    punpckldq     m2, m10                       ; l4, l6, l5, l7 [2x]
    vpblendd      m0, m0, m2, 11110000b         ; l0, l2, l1, l3, l4, l6, l5, l7
    pxor          m2, m2
    punpcklbw     m1, m0, m2                    ; l0, l2, l4, l6
    punpckhbw     m0, m2                        ; l1, l3, l5, l7
%endif
    pcmpeqw      m10, m2, m0
    pand          m1, m10
    por           m0, m1                        ; l[x][] ? l[x][] : l[x-stride][]
    pshufb        m0, [pb_4x1_4x5_4x9_4x13]     ; l[x][1]
    pcmpeqw      m10, m2, m0                    ; !L
    psrlw        m10, 1
    psrlw         m2, m0, [lutq+128]
    vpbroadcastw  m1, [lutq+136]
    pminuw        m2, m1
    pmaxuw        m2, [pw_1]                    ; I
    psrlw         m1, m0, 4                     ; H
    paddw         m0, [pw_2]
    vpbroadcastd  m8, [r11]
    paddw         m0, m0
    paddw         m0, m2                        ; E
    REPX {pmullw x, m8}, m0, m1, m2

    psubw         m8, m3, m4                    ; p1-p0
    psubw         m9, m5, m6                    ; q1-q0
    REPX {pabsw x, x}, m8, m9
    pmaxuw        m8, m10
    pmaxuw        m8, m9
    pcmpgtw       m7, m8, m1                    ; hev
%if %1 != 4
    psubw         m9, m13, m4                   ; p2-p0
    pabsw         m9, m9
    pmaxuw        m9, m8
%if %1 != 6
%ifidn %2, v
    mova         m11, [tmpq+strideq*0]          ; p3
%else
    mova         m11, [rsp+5*32]                ; p3
%endif
    psubw        m10, m11, m4                   ; p3-p0
    pabsw        m10, m10
    pmaxuw        m9, m10
%endif
    psubw        m10, m5, m14                   ; q2-q0
    pabsw        m10, m10
    pmaxuw        m9, m10
%if %1 != 6
    psubw        m10, m5, m15                   ; q3-q0
    pabsw        m10, m10
    pmaxuw        m9, m10
%endif
    vpbroadcastd m10, [r11]
    pcmpgtw       m9, m10                       ; !flat8in

    psubw        m10, m13, m3                   ; p2-p1
    pabsw        m10, m10
%if %1 != 6
    psubw        m11, m13                       ; p3-p2
    pabsw        m11, m11
    pmaxuw       m10, m11
    psubw        m11, m14, m15                  ; q3-q2
    pabsw        m11, m11
    pmaxuw       m10, m11
%endif
    psubw        m11, m14, m6                   ; q2-q1
    pabsw        m11, m11
    pmaxuw       m10, m11

%if %1 == 16
    vpbroadcastd m11, [maskq+8]
    vpbroadcastd  m1, [maskq+4]
    por          m11, m1
    pand         m11, m12
    pcmpeqd      m11, m12
    pand         m10, m11
%else
    vpbroadcastd m11, [maskq+4]
    pand         m11, m12
    pcmpeqd      m11, m12
    pand         m10, m11                       ; only apply fm-wide to wd>4 blocks
%endif
    pmaxuw        m8, m10
%endif
    pcmpgtw       m8, m2

    psubw        m10, m3, m6                    ; p1-q1
    psubw        m11, m4, m5                    ; p0-q0
    REPX {pabsw x, x}, m10, m11
    paddw        m11, m11
    psrlw        m10, 1
    paddw        m10, m11                       ; abs(p0-q0)*2+(abs(p1-q1)>>1)
    pcmpgtw      m10, m0                        ; abs(p0-q0)*2+(abs(p1-q1)>>1) > E
    por           m8, m10

%if %1 == 16

%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]
    mova          m1, [tmpq+strideq*2]
    mova          m2, [tmpq+stride3q]
%else
    mova          m0, [rsp+7*32]
    mova          m1, [rsp+8*32]
    mova          m2, [rsp+9*32]
%endif
    REPX {psubw x, m4}, m0, m1, m2
    REPX {pabsw x, x}, m0, m1, m2
    pmaxuw        m1, m0
    pmaxuw        m1, m2
%ifidn %2, v
    lea         tmpq, [dstq+strideq*4]
    mova          m0, [tmpq+strideq*0]
    mova          m2, [tmpq+strideq*1]
    mova         m10, [tmpq+strideq*2]
%else
    mova          m0, [rsp+10*32]
    mova          m2, [rsp+11*32]
    mova         m10, [rsp+12*32]
%endif
    REPX {psubw x, m5}, m0, m2, m10
    REPX {pabsw x, x}, m0, m2, m10
    pmaxuw        m0, m2
    pmaxuw        m1, m10
    pmaxuw        m1, m0
    vpbroadcastd  m0, [r11]
    pcmpgtw       m1, m0                        ; !flat8out
    por           m1, m9                        ; !flat8in | !flat8out
    vpbroadcastd  m2, [maskq+8]
    pand         m10, m2, m12
    pcmpeqd      m10, m12
    pandn         m1, m10                       ; flat16
    pandn         m1, m8, m1                    ; flat16 & fm

    vpbroadcastd m10, [maskq+4]
    por          m10, m2
    pand          m2, m10, m12
    pcmpeqd       m2, m12
    pandn         m9, m2                        ; flat8in
    pandn         m9, m8, m9
    vpbroadcastd  m2, [maskq+0]
    por           m2, m10
    pand          m2, m12
    pcmpeqd       m2, m12
    pandn         m8, m2
    pandn         m8, m9, m8                    ; fm & !flat8 & !flat16
    pandn         m9, m1, m9                    ; flat8 & !flat16
%elif %1 != 4
    vpbroadcastd  m0, [maskq+4]
    pand          m2, m0, m12
    pcmpeqd       m2, m12
    pandn         m9, m2
    pandn         m9, m8, m9                    ; flat8 & fm
    vpbroadcastd  m2, [maskq+0]
    por           m0, m2
    pand          m0, m12
    pcmpeqd       m0, m12
    pandn         m8, m0
    pandn         m8, m9, m8                    ; fm & !flat8
%else
    vpbroadcastd  m0, [maskq+0]
    pand          m0, m12
    pcmpeqd       m0, m12
    pandn         m8, m0                        ; fm
%endif

    ; short filter
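    ; This is the narrow 4-pixel filter, applied wherever fm is set:
    ;   f  = iclip_diff(p1 - q1) & hev
    ;   f  = iclip_diff(3*(q0 - p0) + f) & fm
    ;   q0 -= iclip_diff(f + 4) >> 3,  p0 += iclip_diff(f + 3) >> 3
    ;   where !hev:  p1 += g, q1 -= g,  with g = ((iclip_diff(f + 4) >> 3) + 1) >> 1
    ; with iclip_diff() clamping to the bitdepth-dependent clip_min/clip_max.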
    vpbroadcastd  m0, [r11+8*1]                 ; 511 or 2047
    vpbroadcastd  m2, [r11+8*2]                 ; -512 or -2048
    psubw        m10, m5, m4
    paddw        m11, m10, m10
    paddw        m11, m10
    psubw        m10, m3, m6                    ; iclip_diff(p1-q1)
    pminsw       m10, m0
    pmaxsw       m10, m2
    pand         m10, m7                        ; f=iclip_diff(p1-q1)&hev
    paddw        m10, m11                       ; f=iclip_diff(3*(q0-p0)+f)
    pminsw       m10, m0
    pmaxsw       m10, m2
    pand          m8, m10                       ; f&=fm
    vpbroadcastd m10, [pw_4]
    paddw        m10, m8
    paddw         m8, [pw_3]
    REPX {pminsw x, m0}, m10, m8
    psraw        m10, 3                         ; f2
    psraw         m8, 3                         ; f1
    psubw         m5, m10
    paddw         m4, m8

    paddw        m10, [pw_1]
    psraw        m10, 1                         ; f=(f1+1)>>1
    pandn         m8, m7, m10                   ; f&=!hev
    paddw         m3, m8
    psubw         m6, m8
    pxor          m8, m8
    psubw         m0, m2                        ; 1023 or 4095
    REPX {pminsw x, m0}, m3, m4, m5, m6
    REPX {pmaxsw x, m8}, m3, m4, m5, m6

%if %1 == 16

; m3-6 = p1/p0/q0/q1, m9=flat8, m1=flat16
; m12=filter bits mask
; m13-15=p2/q2/q3
; m0,2,7-8,10-11 = free

    ; flat16 filter
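    ; The wd == 16 wide filter keeps a running sum (16 sample terms plus the
    ; rounding bias of 8) in m8; each output pixel is m8 >> 4, and between
    ; outputs the oldest taps are subtracted and the next ones added (see the
    ; "sub ..., add ..." comments). vpblendvb commits each result only in
    ; lanes where the flat16 mask m1 is set.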
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*8]
    mova          m0, [tmpq+strideq*1]          ; p6
    mova          m2, [tmpq+strideq*2]          ; p5
    mova          m7, [tmpq+stride3q]           ; p4
    mova         m11, [tmpq+strideq*4]          ; p3
%else
    mova          m0, [rsp+7*32]
    mova          m2, [rsp+8*32]
    mova          m7, [rsp+9*32]
    mova         m11, [rsp+5*32]
%endif

    mova [rsp+ 0*32], m9

    ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    paddw         m8, m0, [pw_1]
    psllw         m8, 3                         ; p6*8+8
    paddw        m10, m2, m7                    ; p5+p4
    psubw         m8, m0
    paddw        m10, m10                       ; (p5+p4)*2
    paddw         m8, m11                       ; p6*7+p3
    paddw        m10, m13                       ; (p5+p4)*2+p2
    paddw         m8, m3                        ; p6*7+p3+p1
    paddw        m10, m4                        ; (p5+p4)*2+p2+p0
    paddw         m8, m5                        ; p6*7+p3+p1+q0
    paddw         m8, m10                       ; p6*7+p5*2+p4*2+p3+p2+p1+p0+q0
    psrlw        m10, m8, 4
    vpblendvb    m10, m2, m10, m1
%ifidn %2, v
    mova [tmpq+strideq*2], m10                  ; p5
%else
    mova [rsp+8*32], m10
%endif

    ; sub p6*2, add p3/q1
    paddw         m8, m11
    paddw        m10, m0, m0
    paddw         m8, m6
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb    m10, m7, m10, m1
%ifidn %2, v
    mova [tmpq+stride3q], m10                   ; p4
%else
    mova [rsp+9*32], m10
%endif

    ; sub p6/p5, add p2/q2
    psubw         m8, m0
    paddw        m10, m13, m14
    psubw         m8, m2
    paddw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb    m10, m11, m10, m1
%ifidn %2, v
    mova [tmpq+strideq*4], m10                  ; p3
    lea         tmpq, [dstq+strideq*4]
%else
    mova [rsp+5*32], m10
%endif

    ; sub p6/p4, add p1/q3
    paddw         m8, m3
    paddw        m10, m0, m7
    paddw         m8, m15
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb    m10, m13, m10, m1
    mova  [rsp+1*32], m10                       ; don't clobber p2/m13

    ; sub p6/p3, add p0/q4
    paddw         m8, m4
    paddw        m10, m0, m11
%ifidn %2, v
    paddw         m8, [tmpq+strideq*0]
%else
    paddw         m8, [rsp+10*32]
%endif
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb    m10, m3, m10, m1
    mova  [rsp+2*32], m10                       ; don't clobber p1/m3

    ; sub p6/p2, add q0/q5
    paddw         m8, m5
    paddw        m10, m0, m13
%ifidn %2, v
    paddw         m8, [tmpq+strideq*1]
%else
    paddw         m8, [rsp+11*32]
%endif
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb    m10, m4, m10, m1
    mova  [rsp+3*32], m10                       ; don't clobber p0/m4

    ; sub p6/p1, add q1/q6
    paddw         m8, m6
    paddw        m10, m0, m3
%ifidn %2, v
    mova          m0, [tmpq+strideq*2]          ; q6
%else
    mova          m0, [rsp+12*32]               ; q6
%endif
    paddw         m8, m0
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb    m10, m5, m10, m1
    mova  [rsp+4*32], m10                       ; don't clobber q0/m5

    ; sub p5/p0, add q2/q6
    paddw         m8, m14
    paddw        m10, m2, m4
    paddw         m8, m0
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb     m2, m6, m10, m1               ; don't clobber q1/m6

    ; sub p4/q0, add q3/q6
    paddw         m8, m15
    paddw        m10, m7, m5
    paddw         m8, m0
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb     m7, m14, m10, m1              ; don't clobber q2/m14

    ; sub p3/q1, add q4/q6
%ifidn %2, v
    paddw         m8, [tmpq+strideq*0]
%else
    paddw         m8, [rsp+10*32]
%endif
    paddw        m10, m11, m6
    paddw         m8, m0
    psubw         m8, m10
    psrlw        m10, m8, 4
    vpblendvb    m10, m15, m10, m1
%ifidn %2, v
    mova [tmpq+mstrideq], m10                   ; q3
%else
    mova [rsp+14*32], m10
%endif

    ; sub p2/q2, add q5/q6
%ifidn %2, v
    paddw         m8, [tmpq+strideq*1]
%else
    paddw         m8, [rsp+11*32]
%endif
    paddw        m10, m13, m14
    paddw         m8, m0
    psubw         m8, m10
    psrlw        m10, m8, 4
%ifidn %2, v
    mova          m9, [tmpq+strideq*0]
%else
    mova          m9, [rsp+10*32]
%endif
    vpblendvb    m10, m9, m10, m1
%ifidn %2, v
    mova [tmpq+strideq*0], m10                   ; q4
%else
    mova [rsp+10*32], m10
%endif

    ; sub p1/q3, add q6*2
    psubw         m8, m3
    paddw         m0, m0
    psubw         m8, m15
    paddw         m8, m0
    psrlw        m10, m8, 4
%ifidn %2, v
    mova          m9, [tmpq+strideq*1]
%else
    mova          m9, [rsp+11*32]
%endif
    vpblendvb    m10, m9, m10, m1
%ifidn %2, v
    mova [tmpq+strideq*1], m10                  ; q5
%else
    mova [rsp+11*32], m10
%endif

    mova          m9, [rsp+0*32]
    mova         m13, [rsp+1*32]
    mova          m3, [rsp+2*32]
    mova          m4, [rsp+3*32]
    mova          m5, [rsp+4*32]
    SWAP           2, 6
    SWAP           7, 14
%ifidn %2, v
    lea         tmpq, [dstq+mstrideq*4]
%else
    mova         m15, [rsp+14*32]
%endif
%endif

%if %1 >= 8
    ; flat8 filter
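    ; The wd >= 8 flat filter replaces p2..q2 with rounded averages over the
    ; p3..q3 window, e.g. p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3;
    ; pmulhrsw with pw_4096 (2^12) computes (x + 4) >> 3 on each 8-term sum,
    ; and pblendvb keeps the result only where the flat8 mask m9 is set.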
    vpbroadcastd  m7, [pw_4096]
%ifidn %2, v
    mova          m0, [tmpq+strideq*0]          ; p3
%else
    mova          m0, [rsp+5*32]                ; p3
%endif
    paddw         m1, m0, m13                   ; p3+p2
    paddw         m2, m3, m4                    ; p1+p0
    paddw         m8, m1, m1                    ; 2*(p3+p2)
    paddw         m2, m0                        ; p1+p0+p3
    paddw         m8, m5                        ; 2*(p3+p2)+q0
    paddw         m2, m8                        ; 3*p3+2*p2+p1+p0+q0
    pmulhrsw     m10, m2, m7

    paddw         m8, m3, m6
    psubw         m2, m1
    paddw         m2, m8
    pmulhrsw      m8, m2, m7

    paddw        m11, m0, m3
    paddw         m1, m4, m14
    psubw         m2, m11
    paddw         m2, m1
    pmulhrsw      m1, m2, m7

    paddw        m11, m0, m4
    pblendvb      m4, m1, m9
    paddw         m1, m5, m15
    psubw         m2, m11
    paddw         m2, m1
    pmulhrsw     m11, m2, m7

    paddw         m2, m6
    paddw         m2, m15
    paddw         m1, m13, m5
    pblendvb      m5, m11, m9
    pblendvb     m13, m10, m9
    psubw         m2, m1
    pmulhrsw      m1, m2, m7

    psubw         m2, m3
    pblendvb      m3, m8, m9
    psubw         m2, m6
    pblendvb      m6, m1, m9
    paddw         m1, m15, m14
    paddw         m2, m1
    pmulhrsw      m2, m7

    pblendvb     m14, m2, m9

%ifidn %2, v
    mova [tmpq+strideq*1], m13                  ; p2
    mova [tmpq+strideq*2], m3                   ; p1
    mova [tmpq+stride3q ], m4                   ; p0
    mova [dstq+strideq*0], m5                   ; q0
    mova [dstq+strideq*1], m6                   ; q1
    mova [dstq+strideq*2], m14                  ; q2
%elif %1 == 8
    TRANSPOSE8X8W  0, 13, 3, 4, 5, 6, 14, 15, 1

    ; write 8x16
    movu   [dstq+strideq*0-8], xm0
    movu   [dstq+strideq*1-8], xm13
    movu   [dstq+strideq*2-8], xm3
    movu   [dstq+stride3q -8], xm4
    lea         dstq, [dstq+strideq*4]
    movu   [dstq+strideq*0-8], xm5
    movu   [dstq+strideq*1-8], xm6
    movu   [dstq+strideq*2-8], xm14
    movu   [dstq+stride3q -8], xm15
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0-8], m0, 1
    vextracti128 [dstq+strideq*1-8], m13, 1
    vextracti128 [dstq+strideq*2-8], m3, 1
    vextracti128 [dstq+stride3q -8], m4, 1
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0-8], m5, 1
    vextracti128 [dstq+strideq*1-8], m6, 1
    vextracti128 [dstq+strideq*2-8], m14, 1
    vextracti128 [dstq+stride3q -8], m15, 1
    lea         dstq, [dstq+strideq*4]
%else
    mova          m8, [rsp+6*32]
    mova          m1, [rsp+7*32]
    mova          m2, [rsp+8*32]
    mova          m7, [rsp+9*32]
    TRANSPOSE8X8W  8, 1, 2, 7, 0, 13, 3, 4, 9

    mova [dstq+strideq*0-16], xm8
    mova [dstq+strideq*1-16], xm1
    mova [dstq+strideq*2-16], xm2
    mova [dstq+stride3q -16], xm7
    lea         tmpq, [dstq+strideq*4]
    mova [tmpq+strideq*0-16], xm0
    mova [tmpq+strideq*1-16], xm13
    mova [tmpq+strideq*2-16], xm3
    mova [tmpq+stride3q -16], xm4
    lea         tmpq, [tmpq+strideq*4]
    vextracti128 [tmpq+strideq*0-16], m8, 1
    vextracti128 [tmpq+strideq*1-16], m1, 1
    vextracti128 [tmpq+strideq*2-16], m2, 1
    vextracti128 [tmpq+stride3q -16], m7, 1
    lea         tmpq, [tmpq+strideq*4]
    vextracti128 [tmpq+strideq*0-16], m0, 1
    vextracti128 [tmpq+strideq*1-16], m13, 1
    vextracti128 [tmpq+strideq*2-16], m3, 1
    vextracti128 [tmpq+stride3q -16], m4, 1

    mova          m0, [rsp+10*32]
    mova          m1, [rsp+11*32]
    mova          m2, [rsp+12*32]
    mova          m3, [rsp+13*32]
    TRANSPOSE8X8W  5, 6, 14, 15, 0, 1, 2, 3, 4
    mova [dstq+strideq*0], xm5
    mova [dstq+strideq*1], xm6
    mova [dstq+strideq*2], xm14
    mova [dstq+stride3q ], xm15
    lea         dstq, [dstq+strideq*4]
    mova [dstq+strideq*0], xm0
    mova [dstq+strideq*1], xm1
    mova [dstq+strideq*2], xm2
    mova [dstq+stride3q ], xm3
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0], m5, 1
    vextracti128 [dstq+strideq*1], m6, 1
    vextracti128 [dstq+strideq*2], m14, 1
    vextracti128 [dstq+stride3q ], m15, 1
    lea         dstq, [dstq+strideq*4]
    vextracti128 [dstq+strideq*0], m0, 1
    vextracti128 [dstq+strideq*1], m1, 1
    vextracti128 [dstq+strideq*2], m2, 1
    vextracti128 [dstq+stride3q ], m3, 1
    lea         dstq, [dstq+strideq*4]
%endif
%elif %1 == 6
    ; flat6 filter
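    ; wd == 6 (chroma) variant: p1..q1 are replaced by rounded averages over
    ; the p2..q2 window, again via pmulhrsw with pw_4096 ((x + 4) >> 3) and
    ; blended under the flat mask in m9.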
    vpbroadcastd  m7, [pw_4096]
    paddw         m8, m3, m4
    paddw         m8, m13                       ; p2+p1+p0
    paddw        m11, m13, m5
    paddw         m8, m8
    paddw         m8, m11                       ; p2+2*(p2+p1+p0)+q0
    pmulhrsw      m2, m8, m7

    paddw         m8, m5
    paddw        m11, m13, m13
    paddw         m8, m6
    psubw         m8, m11
    pmulhrsw     m10, m8, m7

    paddw         m8, m6
    paddw        m11, m13, m3
    paddw         m8, m14
    psubw         m8, m11
    pmulhrsw     m11, m8, m7

    psubw         m8, m3
    paddw        m14, m14
    psubw         m8, m4
    paddw         m8, m14
    pmulhrsw      m8, m7

    pblendvb      m3, m2, m9
    pblendvb      m4, m10, m9
    pblendvb      m5, m11, m9
    pblendvb      m6, m8, m9

%ifidn %2, v
    mova [tmpq+strideq*2], m3                   ; p1
    mova [tmpq+stride3q ], m4                   ; p0
    mova [dstq+strideq*0], m5                   ; q0
    mova [dstq+strideq*1], m6                   ; q1
%else
    TRANSPOSE_8x4_AND_WRITE_4x16
%endif
%else
%ifidn %2, v
    mova [tmpq+strideq*0], m3                   ; p1
    mova [tmpq+strideq*1], m4                   ; p0
    mova [tmpq+strideq*2], m5                   ; q0
    mova [tmpq+stride3q ], m6                   ; q1
%else
    TRANSPOSE_8x4_AND_WRITE_4x16
%endif
%endif
%endmacro

INIT_YMM avx2
cglobal lpf_v_sb_y_16bpc, 6, 12, 16, 32 * 5, \
                          dst, stride, mask, l, l_stride, lut, \
                          w, stride3, mstride, tmp, mask_bits
    mov          r6d, r7m
    lea          r11, [pw_4]
    shr          r6d, 11                      ; is_12bpc
    lea          r11, [r11+r6*4]
    mov           wd, wm
    shl    l_strideq, 2
    sub           lq, l_strideq
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]
    mov   mask_bitsd, 0xf
    mova         m12, [pb_mask]

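    ; Each loop iteration filters one 16-pixel-wide strip (four 4-pixel units):
    ; mask_bitsd and m12 select the four vmask bits belonging to the current
    ; strip and are both shifted left by 4 before advancing to the next one.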
.loop:
    test   [maskq+8], mask_bitsd              ; vmask[2]
    jz .no_flat16

    FILTER        16, v
    jmp .end

.no_flat16:
    test   [maskq+4], mask_bitsd              ; vmask[1]
    jz .no_flat

    FILTER         8, v
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd              ; vmask[0]
    jz .end

    call .v4

.end:
    pslld        m12, 4
    add           lq, 16
    add         dstq, 32
    shl   mask_bitsd, 4
    sub           wd, 4
    jg .loop
    RET
ALIGN function_align
.v4:
    FILTER         4, v
    ret

INIT_YMM avx2
cglobal lpf_h_sb_y_16bpc, 6, 12, 16, 32 * 15, \
                          dst, stride, mask, l, l_stride, lut, \
                          h, stride3, l_stride3, tmp, mask_bits
    mov          r6d, r7m
    lea          r11, [pw_4]
    shr          r6d, 11                      ; is_12bpc
    lea          r11, [r11+r6*4]
    mov           hd, hm
    shl    l_strideq, 2
    sub           lq, 4
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]
    mov   mask_bitsd, 0xf
    mova         m12, [pb_mask]

.loop:
    test   [maskq+8], mask_bitsd            ; vmask[2]
    jz .no_flat16

    FILTER        16, h
    jmp .end

.no_flat16:
    test   [maskq+4], mask_bitsd            ; vmask[1]
    jz .no_flat

    FILTER         8, h
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd            ; vmask[0]
    jz .no_filter

    call .h4
    jmp .end

.no_filter:
    lea         dstq, [dstq+strideq*8]
    lea         dstq, [dstq+strideq*8]
.end:
    pslld        m12, 4
    lea           lq, [lq+l_strideq*4]
    shl   mask_bitsd, 4
    sub           hd, 4
    jg .loop
    RET
ALIGN function_align
.h4:
    FILTER         4, h
    ret

INIT_YMM avx2
cglobal lpf_v_sb_uv_16bpc, 6, 12, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           w, stride3, mstride, tmp, mask_bits
    mov          r6d, r7m
    lea          r11, [pw_4]
    shr          r6d, 11                      ; is_12bpc
    lea          r11, [r11+r6*4]
    mov           wd, wm
    shl    l_strideq, 2
    sub           lq, l_strideq
    mov     mstrideq, strideq
    neg     mstrideq
    lea     stride3q, [strideq*3]
    mov   mask_bitsd, 0xf
    mova         m12, [pb_mask]

.loop:
    test   [maskq+4], mask_bitsd            ; vmask[1]
    jz .no_flat

    FILTER         6, v
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd            ; vmask[0]
    jz .end

    call mangle(private_prefix %+ _lpf_v_sb_y_16bpc_avx2).v4

.end:
    pslld        m12, 4
    add           lq, 16
    add         dstq, 32
    shl   mask_bitsd, 4
    sub           wd, 4
    jg .loop
    RET

INIT_YMM avx2
cglobal lpf_h_sb_uv_16bpc, 6, 12, 16, \
                           dst, stride, mask, l, l_stride, lut, \
                           h, stride3, l_stride3, tmp, mask_bits
    mov          r6d, r7m
    lea          r11, [pw_4]
    shr          r6d, 11                      ; is_12bpc
    lea          r11, [r11+r6*4]
    mov           hd, hm
    shl    l_strideq, 2
    sub           lq, 4
    lea     stride3q, [strideq*3]
    lea   l_stride3q, [l_strideq*3]
    mov   mask_bitsd, 0xf
    mova         m12, [pb_mask]

.loop:
    test   [maskq+4], mask_bitsd            ; vmask[1]
    jz .no_flat

    FILTER         6, h
    jmp .end

.no_flat:
    test   [maskq+0], mask_bitsd            ; vmask[0]
    jz .no_filter

    call mangle(private_prefix %+ _lpf_h_sb_y_16bpc_avx2).h4
    jmp .end

.no_filter:
    lea         dstq, [dstq+strideq*8]
    lea         dstq, [dstq+strideq*8]
.end:
    pslld        m12, 4
    lea           lq, [lq+l_strideq*4]
    shl   mask_bitsd, 4
    sub           hd, 4
    jg .loop
    RET

%endif ; ARCH_X86_64