• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1; Copyright © 2018, VideoLAN and dav1d authors
2; Copyright © 2018, Two Orioles, LLC
3; All rights reserved.
4;
5; Redistribution and use in source and binary forms, with or without
6; modification, are permitted provided that the following conditions are met:
7;
8; 1. Redistributions of source code must retain the above copyright notice, this
9;    list of conditions and the following disclaimer.
10;
11; 2. Redistributions in binary form must reproduce the above copyright notice,
12;    this list of conditions and the following disclaimer in the documentation
13;    and/or other materials provided with the distribution.
14;
15; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
26%include "config.asm"
27%include "ext/x86/x86inc.asm"
28
29%if ARCH_X86_64
30
31SECTION_RODATA 32
32
33wiener_l_shuf: db  4,  4,  4,  4,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
34               db  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15
35wiener_shufA:  db  1,  7,  2,  8,  3,  9,  4, 10,  5, 11,  6, 12,  7, 13,  8, 14
36wiener_shufB:  db  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  7,  8,  8,  9,  9, 10
37wiener_shufC:  db  6,  5,  7,  6,  8,  7,  9,  8, 10,  9, 11, 10, 12, 11, 13, 12
38sgr_l_shuf:    db  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11
39sgr_r_ext:     times 16 db 1
40               times 16 db 9
41
42; dword version of dav1d_sgr_x_by_x[] for use with gathers, wastes a bit of
43; cache but eliminates some shifts in the inner sgr loop which is overall a win
44const sgr_x_by_x_avx2
45              dd 255,128, 85, 64, 51, 43, 37, 32, 28, 26, 23, 21, 20, 18, 17, 16
46              dd  15, 14, 13, 13, 12, 12, 11, 11, 10, 10,  9,  9,  9,  9,  8,  8
47              dd   8,  8,  7,  7,  7,  7,  7,  6,  6,  6,  6,  6,  6,  6,  5,  5
48              dd   5,  5,  5,  5,  5,  5,  5,  5,  4,  4,  4,  4,  4,  4,  4,  4
49              dd   4,  4,  4,  4,  4,  4,  4,  4,  4,  3,  3,  3,  3,  3,  3,  3
50              dd   3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3
51              dd   3,  3,  3,  3,  3,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
52              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
53              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
54              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2
55              dd   2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  1,  1,  1,  1,  1,  1
56              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
57              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
58              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
59              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1
60              dd   1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0
61
62               times 4 db -1 ; needed for 16-bit sgr
63pb_m5:         times 4 db -5
64pb_3:          times 4 db 3
65pw_5_6:        dw 5, 6
66
67sgr_shuf:      db  1, -1,  2, -1,  3, -1,  4, -1,  5, -1,  6, -1,  7, -1,  8, -1
68               db  9, -1, 10, -1, 11, -1, 12, -1
69
70pw_256:        times 2 dw 256
71pw_2056:       times 2 dw 2056
72pw_m16380:     times 2 dw -16380
73pd_25:         dd 25
74pd_34816:      dd 34816
75pd_m4096:      dd -4096
76pd_0xf00801c7: dd 0xf00801c7
77pd_0xf00800a4: dd 0xf00800a4
78
79cextern pb_0to63
80
81SECTION .text
82
83DECLARE_REG_TMP 8, 7, 9, 11, 12, 13, 14 ; ring buffer pointers
84
85INIT_YMM avx2
86cglobal wiener_filter7_8bpc, 4, 15, 16, -384*12-16, dst, stride, left, lpf, \
87                                                    w, h, edge, flt
88    mov           fltq, r6mp
89    movifnidn       hd, hm
90    mov          edged, r7m
91    mov             wd, wm
92    vbroadcasti128  m6, [wiener_shufA]
93    vpbroadcastb   m11, [fltq+ 0] ; x0 x0
94    vbroadcasti128  m7, [wiener_shufB]
95    vpbroadcastd   m12, [fltq+ 2]
96    vbroadcasti128  m8, [wiener_shufC]
97    packsswb       m12, m12       ; x1 x2
98    vpbroadcastw   m13, [fltq+ 6] ; x3
99    vbroadcasti128  m9, [sgr_shuf+6]
100    add           lpfq, wq
101    vpbroadcastd   m10, [pw_m16380]
102    vpbroadcastd   m14, [fltq+16] ; y0 y1
103    add           dstq, wq
104    vpbroadcastd   m15, [fltq+20] ; y2 y3
105    lea             t1, [rsp+wq*2+16]
106    psllw          m14, 5
107    neg             wq
108    psllw          m15, 5
109    test         edgeb, 4 ; LR_HAVE_TOP
110    jz .no_top
111    call .h_top
112    add           lpfq, strideq
113    mov             t6, t1
114    mov             t5, t1
115    add             t1, 384*2
116    call .h_top
117    lea            r10, [lpfq+strideq*4]
118    mov           lpfq, dstq
119    mov             t4, t1
120    add             t1, 384*2
121    add            r10, strideq
122    mov          [rsp], r10 ; below
123    call .h
124    mov             t3, t1
125    mov             t2, t1
126    dec             hd
127    jz .v1
128    add           lpfq, strideq
129    add             t1, 384*2
130    call .h
131    mov             t2, t1
132    dec             hd
133    jz .v2
134    add           lpfq, strideq
135    add             t1, 384*2
136    call .h
137    dec             hd
138    jz .v3
139.main:
140    lea             t0, [t1+384*2]
141.main_loop:
142    call .hv
143    dec             hd
144    jnz .main_loop
145    test         edgeb, 8 ; LR_HAVE_BOTTOM
146    jz .v3
147    mov           lpfq, [rsp]
148    call .hv_bottom
149    add           lpfq, strideq
150    call .hv_bottom
151.v1:
152    call .v
153    RET
154.no_top:
155    lea            r10, [lpfq+strideq*4]
156    mov           lpfq, dstq
157    lea            r10, [r10+strideq*2]
158    mov          [rsp], r10
159    call .h
160    mov             t6, t1
161    mov             t5, t1
162    mov             t4, t1
163    mov             t3, t1
164    mov             t2, t1
165    dec             hd
166    jz .v1
167    add           lpfq, strideq
168    add             t1, 384*2
169    call .h
170    mov             t2, t1
171    dec             hd
172    jz .v2
173    add           lpfq, strideq
174    add             t1, 384*2
175    call .h
176    dec             hd
177    jz .v3
178    lea             t0, [t1+384*2]
179    call .hv
180    dec             hd
181    jz .v3
182    add             t0, 384*8
183    call .hv
184    dec             hd
185    jnz .main
186.v3:
187    call .v
188.v2:
189    call .v
190    jmp .v1
191.extend_right:
192    movd           xm2, r10d
193    vpbroadcastd    m0, [pb_3]
194    vpbroadcastd    m1, [pb_m5]
195    vpbroadcastb    m2, xm2
196    mova            m3, [pb_0to63]
197    psubb           m0, m2
198    psubb           m1, m2
199    pminub          m0, m3
200    pminub          m1, m3
201    pshufb          m4, m0
202    pshufb          m5, m1
203    ret
204.h:
205    mov            r10, wq
206    test         edgeb, 1 ; LR_HAVE_LEFT
207    jz .h_extend_left
208    movd           xm4, [leftq]
209    vpblendd        m4, [lpfq+r10-4], 0xfe
210    add          leftq, 4
211    jmp .h_main
212.h_extend_left:
213    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
214    mova            m4, [lpfq+r10] ; before the start of the buffer
215    palignr         m4, m5, 12
216    pshufb          m4, [wiener_l_shuf]
217    jmp .h_main
218.h_top:
219    mov            r10, wq
220    test         edgeb, 1 ; LR_HAVE_LEFT
221    jz .h_extend_left
222.h_loop:
223    movu            m4, [lpfq+r10-4]
224.h_main:
225    movu            m5, [lpfq+r10+4]
226    test         edgeb, 2 ; LR_HAVE_RIGHT
227    jnz .h_have_right
228    cmp           r10d, -34
229    jl .h_have_right
230    call .extend_right
231.h_have_right:
232    pshufb          m0, m4, m6
233    pmaddubsw       m0, m11
234    pshufb          m1, m5, m6
235    pmaddubsw       m1, m11
236    pshufb          m2, m4, m7
237    pmaddubsw       m2, m12
238    pshufb          m3, m5, m7
239    pmaddubsw       m3, m12
240    paddw           m0, m2
241    pshufb          m2, m4, m8
242    pmaddubsw       m2, m12
243    paddw           m1, m3
244    pshufb          m3, m5, m8
245    pmaddubsw       m3, m12
246    pshufb          m4, m9
247    paddw           m0, m2
248    pmullw          m2, m4, m13
249    pshufb          m5, m9
250    paddw           m1, m3
251    pmullw          m3, m5, m13
252    psllw           m4, 7
253    psllw           m5, 7
254    paddw           m4, m10
255    paddw           m5, m10
256    paddw           m0, m2
257    vpbroadcastd    m2, [pw_2056]
258    paddw           m1, m3
259    paddsw          m0, m4
260    paddsw          m1, m5
261    psraw           m0, 3
262    psraw           m1, 3
263    paddw           m0, m2
264    paddw           m1, m2
265    mova [t1+r10*2+ 0], m0
266    mova [t1+r10*2+32], m1
267    add            r10, 32
268    jl .h_loop
269    ret
270ALIGN function_align
271.hv:
272    add           lpfq, strideq
273    mov            r10, wq
274    test         edgeb, 1 ; LR_HAVE_LEFT
275    jz .hv_extend_left
276    movd           xm4, [leftq]
277    vpblendd        m4, [lpfq+r10-4], 0xfe
278    add          leftq, 4
279    jmp .hv_main
280.hv_extend_left:
281    movu            m4, [lpfq+r10-4]
282    pshufb          m4, [wiener_l_shuf]
283    jmp .hv_main
284.hv_bottom:
285    mov            r10, wq
286    test         edgeb, 1 ; LR_HAVE_LEFT
287    jz .hv_extend_left
288.hv_loop:
289    movu            m4, [lpfq+r10-4]
290.hv_main:
291    movu            m5, [lpfq+r10+4]
292    test         edgeb, 2 ; LR_HAVE_RIGHT
293    jnz .hv_have_right
294    cmp           r10d, -34
295    jl .hv_have_right
296    call .extend_right
297.hv_have_right:
298    pshufb          m0, m4, m6
299    pmaddubsw       m0, m11
300    pshufb          m1, m5, m6
301    pmaddubsw       m1, m11
302    pshufb          m2, m4, m7
303    pmaddubsw       m2, m12
304    pshufb          m3, m5, m7
305    pmaddubsw       m3, m12
306    paddw           m0, m2
307    pshufb          m2, m4, m8
308    pmaddubsw       m2, m12
309    paddw           m1, m3
310    pshufb          m3, m5, m8
311    pmaddubsw       m3, m12
312    pshufb          m4, m9
313    paddw           m0, m2
314    pmullw          m2, m4, m13
315    pshufb          m5, m9
316    paddw           m1, m3
317    pmullw          m3, m5, m13
318    psllw           m4, 7
319    psllw           m5, 7
320    paddw           m4, m10
321    paddw           m5, m10
322    paddw           m0, m2
323    paddw           m1, m3
324    mova            m2, [t4+r10*2]
325    paddw           m2, [t2+r10*2]
326    mova            m3, [t3+r10*2]
327    paddsw          m0, m4
328    vpbroadcastd    m4, [pw_2056]
329    paddsw          m1, m5
330    mova            m5, [t5+r10*2]
331    paddw           m5, [t1+r10*2]
332    psraw           m0, 3
333    psraw           m1, 3
334    paddw           m0, m4
335    paddw           m1, m4
336    paddw           m4, m0, [t6+r10*2]
337    mova    [t0+r10*2], m0
338    punpcklwd       m0, m2, m3
339    pmaddwd         m0, m15
340    punpckhwd       m2, m3
341    pmaddwd         m2, m15
342    punpcklwd       m3, m4, m5
343    pmaddwd         m3, m14
344    punpckhwd       m4, m5
345    pmaddwd         m4, m14
346    paddd           m0, m3
347    paddd           m4, m2
348    mova            m2, [t4+r10*2+32]
349    paddw           m2, [t2+r10*2+32]
350    mova            m3, [t3+r10*2+32]
351    mova            m5, [t5+r10*2+32]
352    paddw           m5, [t1+r10*2+32]
353    packuswb        m0, m4
354    paddw           m4, m1, [t6+r10*2+32]
355    mova [t0+r10*2+32], m1
356    punpcklwd       m1, m2, m3
357    pmaddwd         m1, m15
358    punpckhwd       m2, m3
359    pmaddwd         m2, m15
360    punpcklwd       m3, m4, m5
361    pmaddwd         m3, m14
362    punpckhwd       m4, m5
363    pmaddwd         m4, m14
364    paddd           m1, m3
365    paddd           m2, m4
366    packuswb        m1, m2
367    psrlw           m0, 8
368    psrlw           m1, 8
369    packuswb        m0, m1
370    mova    [dstq+r10], m0
371    add            r10, 32
372    jl .hv_loop
373    mov             t6, t5
374    mov             t5, t4
375    mov             t4, t3
376    mov             t3, t2
377    mov             t2, t1
378    mov             t1, t0
379    mov             t0, t6
380    add           dstq, strideq
381    ret
382.v:
383    mov            r10, wq
384.v_loop:
385    mova            m2, [t4+r10*2+ 0]
386    paddw           m2, [t2+r10*2+ 0]
387    mova            m4, [t3+r10*2+ 0]
388    mova            m6, [t1+r10*2+ 0]
389    paddw           m8, m6, [t6+r10*2+ 0]
390    paddw           m6, [t5+r10*2+ 0]
391    mova            m3, [t4+r10*2+32]
392    paddw           m3, [t2+r10*2+32]
393    mova            m5, [t3+r10*2+32]
394    mova            m7, [t1+r10*2+32]
395    paddw           m9, m7, [t6+r10*2+32]
396    paddw           m7, [t5+r10*2+32]
397    punpcklwd       m0, m2, m4
398    pmaddwd         m0, m15
399    punpckhwd       m2, m4
400    pmaddwd         m2, m15
401    punpcklwd       m4, m8, m6
402    pmaddwd         m4, m14
403    punpckhwd       m6, m8, m6
404    pmaddwd         m6, m14
405    punpcklwd       m1, m3, m5
406    pmaddwd         m1, m15
407    punpckhwd       m3, m5
408    pmaddwd         m3, m15
409    punpcklwd       m5, m9, m7
410    pmaddwd         m5, m14
411    punpckhwd       m7, m9, m7
412    pmaddwd         m7, m14
413    paddd           m0, m4
414    paddd           m2, m6
415    paddd           m1, m5
416    paddd           m3, m7
417    packuswb        m0, m2
418    packuswb        m1, m3
419    psrlw           m0, 8
420    psrlw           m1, 8
421    packuswb        m0, m1
422    mova    [dstq+r10], m0
423    add            r10, 32
424    jl .v_loop
425    mov             t6, t5
426    mov             t5, t4
427    mov             t4, t3
428    mov             t3, t2
429    mov             t2, t1
430    add           dstq, strideq
431    ret
432
433cglobal wiener_filter5_8bpc, 4, 13, 16, 384*8+16, dst, stride, left, lpf, \
434                                                  w, h, edge, flt
435    mov           fltq, r6mp
436    movifnidn       hd, hm
437    mov          edged, r7m
438    mov             wd, wm
439    vbroadcasti128  m6, [wiener_shufB]
440    vpbroadcastd   m12, [fltq+ 2]
441    vbroadcasti128  m7, [wiener_shufC]
442    packsswb       m12, m12       ; x1 x2
443    vpbroadcastw   m13, [fltq+ 6] ; x3
444    vbroadcasti128  m8, [sgr_shuf+6]
445    add           lpfq, wq
446    vpbroadcastd    m9, [pw_m16380]
447    vpbroadcastd   m10, [pw_2056]
448    mova           m11, [wiener_l_shuf]
449    vpbroadcastd   m14, [fltq+16] ; __ y1
450    add           dstq, wq
451    vpbroadcastd   m15, [fltq+20] ; y2 y3
452    lea             t1, [rsp+wq*2+16]
453    psllw          m14, 5
454    neg             wq
455    psllw          m15, 5
456    test         edgeb, 4 ; LR_HAVE_TOP
457    jz .no_top
458    call .h_top
459    add           lpfq, strideq
460    mov             t4, t1
461    add             t1, 384*2
462    call .h_top
463    lea            r10, [lpfq+strideq*4]
464    mov           lpfq, dstq
465    mov             t3, t1
466    add             t1, 384*2
467    add            r10, strideq
468    mov          [rsp], r10 ; below
469    call .h
470    mov             t2, t1
471    dec             hd
472    jz .v1
473    add           lpfq, strideq
474    add             t1, 384*2
475    call .h
476    dec             hd
477    jz .v2
478.main:
479    mov             t0, t4
480.main_loop:
481    call .hv
482    dec             hd
483    jnz .main_loop
484    test         edgeb, 8 ; LR_HAVE_BOTTOM
485    jz .v2
486    mov           lpfq, [rsp]
487    call .hv_bottom
488    add           lpfq, strideq
489    call .hv_bottom
490.end:
491    RET
492.no_top:
493    lea            r10, [lpfq+strideq*4]
494    mov           lpfq, dstq
495    lea            r10, [r10+strideq*2]
496    mov          [rsp], r10
497    call .h
498    mov             t4, t1
499    mov             t3, t1
500    mov             t2, t1
501    dec             hd
502    jz .v1
503    add           lpfq, strideq
504    add             t1, 384*2
505    call .h
506    dec             hd
507    jz .v2
508    lea             t0, [t1+384*2]
509    call .hv
510    dec             hd
511    jz .v2
512    add             t0, 384*6
513    call .hv
514    dec             hd
515    jnz .main
516.v2:
517    call .v
518    mov             t4, t3
519    mov             t3, t2
520    mov             t2, t1
521    add           dstq, strideq
522.v1:
523    call .v
524    jmp .end
525.h:
526    mov            r10, wq
527    test         edgeb, 1 ; LR_HAVE_LEFT
528    jz .h_extend_left
529    movd           xm4, [leftq]
530    vpblendd        m4, [lpfq+r10-4], 0xfe
531    add          leftq, 4
532    jmp .h_main
533.h_extend_left:
534    vbroadcasti128  m5, [lpfq+r10] ; avoid accessing memory located
535    mova            m4, [lpfq+r10] ; before the start of the buffer
536    palignr         m4, m5, 12
537    pshufb          m4, m11
538    jmp .h_main
539.h_top:
540    mov            r10, wq
541    test         edgeb, 1 ; LR_HAVE_LEFT
542    jz .h_extend_left
543.h_loop:
544    movu            m4, [lpfq+r10-4]
545.h_main:
546    movu            m5, [lpfq+r10+4]
547    test         edgeb, 2 ; LR_HAVE_RIGHT
548    jnz .h_have_right
549    cmp           r10d, -33
550    jl .h_have_right
551    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
552.h_have_right:
553    pshufb          m0, m4, m6
554    pmaddubsw       m0, m12
555    pshufb          m1, m5, m6
556    pmaddubsw       m1, m12
557    pshufb          m2, m4, m7
558    pmaddubsw       m2, m12
559    pshufb          m3, m5, m7
560    pmaddubsw       m3, m12
561    pshufb          m4, m8
562    paddw           m0, m2
563    pmullw          m2, m4, m13
564    pshufb          m5, m8
565    paddw           m1, m3
566    pmullw          m3, m5, m13
567    psllw           m4, 7
568    psllw           m5, 7
569    paddw           m4, m9
570    paddw           m5, m9
571    paddw           m0, m2
572    paddw           m1, m3
573    paddsw          m0, m4
574    paddsw          m1, m5
575    psraw           m0, 3
576    psraw           m1, 3
577    paddw           m0, m10
578    paddw           m1, m10
579    mova [t1+r10*2+ 0], m0
580    mova [t1+r10*2+32], m1
581    add            r10, 32
582    jl .h_loop
583    ret
584ALIGN function_align
585.hv:
586    add           lpfq, strideq
587    mov            r10, wq
588    test         edgeb, 1 ; LR_HAVE_LEFT
589    jz .hv_extend_left
590    movd           xm4, [leftq]
591    vpblendd        m4, [lpfq+r10-4], 0xfe
592    add          leftq, 4
593    jmp .hv_main
594.hv_extend_left:
595    movu            m4, [lpfq+r10-4]
596    pshufb          m4, m11
597    jmp .hv_main
598.hv_bottom:
599    mov            r10, wq
600    test         edgeb, 1 ; LR_HAVE_LEFT
601    jz .hv_extend_left
602.hv_loop:
603    movu            m4, [lpfq+r10-4]
604.hv_main:
605    movu            m5, [lpfq+r10+4]
606    test         edgeb, 2 ; LR_HAVE_RIGHT
607    jnz .hv_have_right
608    cmp           r10d, -33
609    jl .hv_have_right
610    call mangle(private_prefix %+ _wiener_filter7_8bpc_avx2).extend_right
611.hv_have_right:
612    pshufb          m0, m4, m6
613    pmaddubsw       m0, m12
614    pshufb          m1, m5, m6
615    pmaddubsw       m1, m12
616    pshufb          m2, m4, m7
617    pmaddubsw       m2, m12
618    pshufb          m3, m5, m7
619    pmaddubsw       m3, m12
620    pshufb          m4, m8
621    paddw           m0, m2
622    pmullw          m2, m4, m13
623    pshufb          m5, m8
624    paddw           m1, m3
625    pmullw          m3, m5, m13
626    psllw           m4, 7
627    psllw           m5, 7
628    paddw           m4, m9
629    paddw           m5, m9
630    paddw           m0, m2
631    paddw           m1, m3
632    mova            m2, [t3+r10*2]
633    paddw           m2, [t1+r10*2]
634    mova            m3, [t2+r10*2]
635    paddsw          m0, m4
636    paddsw          m1, m5
637    psraw           m0, 3
638    psraw           m1, 3
639    paddw           m0, m10
640    paddw           m1, m10
641    paddw           m4, m0, [t4+r10*2]
642    mova    [t0+r10*2], m0
643    punpcklwd       m0, m2, m3
644    pmaddwd         m0, m15
645    punpckhwd       m2, m3
646    pmaddwd         m2, m15
647    punpcklwd       m3, m4, m4
648    pmaddwd         m3, m14
649    punpckhwd       m4, m4
650    pmaddwd         m4, m14
651    paddd           m0, m3
652    paddd           m4, m2
653    mova            m2, [t3+r10*2+32]
654    paddw           m2, [t1+r10*2+32]
655    mova            m3, [t2+r10*2+32]
656    packuswb        m0, m4
657    paddw           m4, m1, [t4+r10*2+32]
658    mova [t0+r10*2+32], m1
659    punpcklwd       m1, m2, m3
660    pmaddwd         m1, m15
661    punpckhwd       m2, m3
662    pmaddwd         m2, m15
663    punpcklwd       m3, m4, m4
664    pmaddwd         m3, m14
665    punpckhwd       m4, m4
666    pmaddwd         m4, m14
667    paddd           m1, m3
668    paddd           m2, m4
669    packuswb        m1, m2
670    psrlw           m0, 8
671    psrlw           m1, 8
672    packuswb        m0, m1
673    mova    [dstq+r10], m0
674    add            r10, 32
675    jl .hv_loop
676    mov             t4, t3
677    mov             t3, t2
678    mov             t2, t1
679    mov             t1, t0
680    mov             t0, t4
681    add           dstq, strideq
682    ret
683.v:
684    mov            r10, wq
685    psrld          m13, m14, 16 ; y1 __
686.v_loop:
687    mova            m6, [t1+r10*2+ 0]
688    paddw           m2, m6, [t3+r10*2+ 0]
689    mova            m4, [t2+r10*2+ 0]
690    mova            m7, [t1+r10*2+32]
691    paddw           m3, m7, [t3+r10*2+32]
692    mova            m5, [t2+r10*2+32]
693    paddw           m6, [t4+r10*2+ 0]
694    paddw           m7, [t4+r10*2+32]
695    punpcklwd       m0, m2, m4
696    pmaddwd         m0, m15
697    punpckhwd       m2, m4
698    pmaddwd         m2, m15
699    punpcklwd       m1, m3, m5
700    pmaddwd         m1, m15
701    punpckhwd       m3, m5
702    pmaddwd         m3, m15
703    punpcklwd       m5, m7, m6
704    pmaddwd         m4, m5, m14
705    punpckhwd       m7, m6
706    pmaddwd         m6, m7, m14
707    pmaddwd         m5, m13
708    pmaddwd         m7, m13
709    paddd           m0, m4
710    paddd           m2, m6
711    paddd           m1, m5
712    paddd           m3, m7
713    packuswb        m0, m2
714    packuswb        m1, m3
715    psrlw           m0, 8
716    psrlw           m1, 8
717    packuswb        m0, m1
718    mova    [dstq+r10], m0
719    add            r10, 32
720    jl .v_loop
721    ret
722
723cglobal sgr_filter_5x5_8bpc, 4, 13, 16, 400*24+16, dst, stride, left, lpf, \
724                                                   w, h, edge, params
725%define base r12-sgr_x_by_x_avx2-256*4
726    lea            r12, [sgr_x_by_x_avx2+256*4]
727    mov        paramsq, r6mp
728    mov             wd, wm
729    movifnidn       hd, hm
730    mov          edged, r7m
731    vbroadcasti128  m8, [base+sgr_shuf+0]
732    vbroadcasti128  m9, [base+sgr_shuf+8]
733    add           lpfq, wq
734    vbroadcasti128 m10, [base+sgr_shuf+2]
735    add           dstq, wq
736    vbroadcasti128 m11, [base+sgr_shuf+6]
737    lea             t3, [rsp+wq*4+16+400*12]
738    vpbroadcastd   m12, [paramsq+0] ; s0
739    pxor            m6, m6
740    vpbroadcastw    m7, [paramsq+8] ; w0
741    lea             t1, [rsp+wq*2+20]
742    vpbroadcastd   m13, [base+pd_0xf00800a4]
743    neg             wq
744    vpbroadcastd   m14, [base+pd_34816]  ; (1 << 11) + (1 << 15)
745    psllw           m7, 4
746    vpbroadcastd   m15, [base+pd_m4096]
747    test         edgeb, 4 ; LR_HAVE_TOP
748    jz .no_top
749    call .h_top
750    add           lpfq, strideq
751    mov             t2, t1
752    call .top_fixup
753    add             t1, 400*6
754    call .h_top
755    lea            r10, [lpfq+strideq*4]
756    mov           lpfq, dstq
757    add            r10, strideq
758    mov          [rsp], r10 ; below
759    mov             t0, t2
760    dec             hd
761    jz .height1
762    or           edged, 16
763    call .h
764.main:
765    add           lpfq, strideq
766    call .hv
767    call .prep_n
768    sub             hd, 2
769    jl .extend_bottom
770.main_loop:
771    add           lpfq, strideq
772    test            hd, hd
773    jz .odd_height
774    call .h
775    add           lpfq, strideq
776    call .hv
777    call .n0
778    call .n1
779    sub             hd, 2
780    jge .main_loop
781    test         edgeb, 8 ; LR_HAVE_BOTTOM
782    jz .extend_bottom
783    mov           lpfq, [rsp]
784    call .h_top
785    add           lpfq, strideq
786    call .hv_bottom
787.end:
788    call .n0
789    call .n1
790.end2:
791    RET
792.height1:
793    call .hv
794    call .prep_n
795    jmp .odd_height_end
796.odd_height:
797    call .hv
798    call .n0
799    call .n1
800.odd_height_end:
801    call .v
802    call .n0
803    jmp .end2
804.extend_bottom:
805    call .v
806    jmp .end
807.no_top:
808    lea            r10, [lpfq+strideq*4]
809    mov           lpfq, dstq
810    lea            r10, [r10+strideq*2]
811    mov          [rsp], r10
812    call .h
813    lea             t2, [t1+400*6]
814    call .top_fixup
815    dec             hd
816    jz .no_top_height1
817    or           edged, 16
818    mov             t0, t1
819    mov             t1, t2
820    jmp .main
821.no_top_height1:
822    call .v
823    call .prep_n
824    jmp .odd_height_end
825.extend_right:
826    movd           xm2, r10d
827    mova            m0, [sgr_r_ext]
828    vpbroadcastb    m2, xm2
829    psubb           m0, m2
830    pminub          m0, [pb_0to63]
831    pshufb          m5, m0
832    ret
833.h: ; horizontal boxsum
834    lea            r10, [wq-2]
835    test         edgeb, 1 ; LR_HAVE_LEFT
836    jz .h_extend_left
837    vpbroadcastd   xm0, [leftq]
838    mova           xm5, [lpfq+wq]
839    palignr        xm5, xm0, 12
840    add          leftq, 4
841    jmp .h_main
842.h_extend_left:
843    mova           xm5, [lpfq+wq]
844    pshufb         xm5, [base+sgr_l_shuf]
845    jmp .h_main
846.h_top:
847    lea            r10, [wq-2]
848    test         edgeb, 1 ; LR_HAVE_LEFT
849    jz .h_extend_left
850.h_loop:
851    movu           xm5, [lpfq+r10-2]
852.h_main:
853    vinserti128     m5, [lpfq+r10+6], 1
854    test         edgeb, 2 ; LR_HAVE_RIGHT
855    jnz .h_have_right
856    cmp           r10d, -18
857    jl .h_have_right
858    call .extend_right
859.h_have_right:
860    pshufb          m3, m5, m8
861    pmullw          m4, m3, m3
862    pshufb          m2, m5, m9
863    paddw           m0, m3, m2
864    shufps          m3, m2, q2121
865    paddw           m0, m3
866    punpcklwd       m1, m2, m3
867    pmaddwd         m1, m1
868    punpckhwd       m2, m3
869    pmaddwd         m2, m2
870    punpcklwd       m3, m4, m6
871    paddd           m1, m3
872    punpckhwd       m4, m6
873    paddd           m2, m4
874    pshufb          m4, m5, m10
875    paddw           m0, m4
876    pshufb          m5, m11
877    paddw           m0, m5 ; sum
878    punpcklwd       m3, m4, m5
879    pmaddwd         m3, m3
880    punpckhwd       m4, m5
881    pmaddwd         m4, m4
882    test         edgeb, 16 ; y > 0
883    jz .h_loop_end
884    paddw           m0, [t1+r10*2+400*0]
885    paddd           m1, [t1+r10*2+400*2]
886    paddd           m2, [t1+r10*2+400*4]
887.h_loop_end:
888    paddd           m1, m3 ; sumsq
889    paddd           m2, m4
890    mova [t1+r10*2+400*0], m0
891    mova [t1+r10*2+400*2], m1
892    mova [t1+r10*2+400*4], m2
893    add            r10, 16
894    jl .h_loop
895    ret
896.top_fixup:
897    lea            r10, [wq-2]
898.top_fixup_loop: ; the sums of the first row needs to be doubled
899    mova            m0, [t1+r10*2+400*0]
900    mova            m1, [t1+r10*2+400*2]
901    mova            m2, [t1+r10*2+400*4]
902    paddw           m0, m0
903    paddd           m1, m1
904    paddd           m2, m2
905    mova [t2+r10*2+400*0], m0
906    mova [t2+r10*2+400*2], m1
907    mova [t2+r10*2+400*4], m2
908    add            r10, 16
909    jl .top_fixup_loop
910    ret
911ALIGN function_align
912.hv: ; horizontal boxsum + vertical boxsum + ab
913    lea            r10, [wq-2]
914    test         edgeb, 1 ; LR_HAVE_LEFT
915    jz .hv_extend_left
916    vpbroadcastd   xm0, [leftq]
917    mova           xm5, [lpfq+wq]
918    palignr        xm5, xm0, 12
919    add          leftq, 4
920    jmp .hv_main
921.hv_extend_left:
922    mova           xm5, [lpfq+wq]
923    pshufb         xm5, [base+sgr_l_shuf]
924    jmp .hv_main
925.hv_bottom:
926    lea            r10, [wq-2]
927    test         edgeb, 1 ; LR_HAVE_LEFT
928    jz .hv_extend_left
929.hv_loop:
930    movu           xm5, [lpfq+r10-2]
931.hv_main:
932    vinserti128     m5, [lpfq+r10+6], 1
933    test         edgeb, 2 ; LR_HAVE_RIGHT
934    jnz .hv_have_right
935    cmp           r10d, -18
936    jl .hv_have_right
937    call .extend_right
938.hv_have_right:
939    pshufb          m1, m5, m8
940    pmullw          m4, m1, m1
941    pshufb          m3, m5, m9
942    paddw           m0, m1, m3
943    shufps          m1, m3, q2121
944    paddw           m0, m1
945    punpcklwd       m2, m3, m1
946    pmaddwd         m2, m2
947    punpckhwd       m3, m1
948    pmaddwd         m3, m3
949    punpcklwd       m1, m4, m6
950    paddd           m2, m1
951    punpckhwd       m4, m6
952    paddd           m3, m4
953    pshufb          m1, m5, m10
954    paddw           m0, m1
955    pshufb          m5, m11
956    paddw           m0, m5               ; h sum
957    punpcklwd       m4, m5, m1
958    pmaddwd         m4, m4
959    punpckhwd       m5, m1
960    pmaddwd         m5, m5
961    paddw           m1, m0, [t1+r10*2+400*0]
962    paddd           m2, m4               ; h sumsq
963    paddd           m3, m5
964    paddd           m4, m2, [t1+r10*2+400*2]
965    paddd           m5, m3, [t1+r10*2+400*4]
966    test            hd, hd
967    jz .hv_last_row
968.hv_main2:
969    paddw           m1, [t2+r10*2+400*0] ; hv sum
970    paddd           m4, [t2+r10*2+400*2] ; hv sumsq
971    paddd           m5, [t2+r10*2+400*4]
972    mova [t0+r10*2+400*0], m0
973    mova [t0+r10*2+400*2], m2
974    mova [t0+r10*2+400*4], m3
975    vpbroadcastd    m2, [pd_25]
976    punpcklwd       m0, m1, m6           ; b
977    punpckhwd       m1, m6
978    pmulld          m4, m2               ; a * 25
979    pmulld          m5, m2
980    pmaddwd         m2, m0, m0           ; b * b
981    pmaddwd         m3, m1, m1
982    psubd           m4, m2               ; p
983    psubd           m5, m3
984    pmulld          m4, m12              ; p * s
985    pmulld          m5, m12
986    pmaddwd         m0, m13              ; b * 164
987    pmaddwd         m1, m13
988    paddusw         m4, m13
989    paddusw         m5, m13
990    psrad           m3, m4, 20           ; min(z, 255) - 256
991    vpgatherdd      m2, [r12+m3*4], m4   ; x
992    psrad           m4, m5, 20
993    vpgatherdd      m3, [r12+m4*4], m5
994    pmulld          m0, m2
995    pmulld          m1, m3
996    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
997    paddd           m1, m14
998    pand            m0, m15
999    pand            m1, m15
1000    por             m0, m2               ; a | (b << 12)
1001    por             m1, m3
1002    mova         [t3+r10*4+ 8], xm0      ; The neighbor calculations requires
1003    vextracti128 [t3+r10*4+40], m0, 1    ; 13 bits for a and 21 bits for b.
1004    mova         [t3+r10*4+24], xm1      ; Packing them allows for 12+20, but
1005    vextracti128 [t3+r10*4+56], m1, 1    ; that gets us most of the way.
1006    add            r10, 16
1007    jl .hv_loop
1008    mov             t2, t1
1009    mov             t1, t0
1010    mov             t0, t2
1011    ret
1012.hv_last_row: ; esoteric edge case for odd heights
1013    mova [t1+r10*2+400*0], m1
1014    paddw              m1, m0
1015    mova [t1+r10*2+400*2], m4
1016    paddd              m4, m2
1017    mova [t1+r10*2+400*4], m5
1018    paddd              m5, m3
1019    jmp .hv_main2
1020.v: ; vertical boxsum + ab
1021    lea            r10, [wq-2]
1022.v_loop:
1023    mova            m0, [t1+r10*2+400*0]
1024    mova            m2, [t1+r10*2+400*2]
1025    mova            m3, [t1+r10*2+400*4]
1026    paddw           m1, m0, [t2+r10*2+400*0]
1027    paddd           m4, m2, [t2+r10*2+400*2]
1028    paddd           m5, m3, [t2+r10*2+400*4]
1029    paddw           m0, m0
1030    paddd           m2, m2
1031    paddd           m3, m3
1032    paddw           m1, m0               ; hv sum
1033    paddd           m4, m2               ; hv sumsq
1034    paddd           m5, m3
1035    vpbroadcastd    m2, [pd_25]
1036    punpcklwd       m0, m1, m6           ; b
1037    punpckhwd       m1, m6
1038    pmulld          m4, m2               ; a * 25
1039    pmulld          m5, m2
1040    pmaddwd         m2, m0, m0           ; b * b
1041    pmaddwd         m3, m1, m1
1042    psubd           m4, m2               ; p
1043    psubd           m5, m3
1044    pmulld          m4, m12              ; p * s
1045    pmulld          m5, m12
1046    pmaddwd         m0, m13              ; b * 164
1047    pmaddwd         m1, m13
1048    paddusw         m4, m13
1049    paddusw         m5, m13
1050    psrad           m3, m4, 20           ; min(z, 255) - 256
1051    vpgatherdd      m2, [r12+m3*4], m4   ; x
1052    psrad           m4, m5, 20
1053    vpgatherdd      m3, [r12+m4*4], m5
1054    pmulld          m0, m2
1055    pmulld          m1, m3
1056    paddd           m0, m14              ; x * b * 164 + (1 << 11) + (1 << 15)
1057    paddd           m1, m14
1058    pand            m0, m15
1059    pand            m1, m15
1060    por             m0, m2               ; a | (b << 12)
1061    por             m1, m3
1062    mova         [t3+r10*4+ 8], xm0
1063    vextracti128 [t3+r10*4+40], m0, 1
1064    mova         [t3+r10*4+24], xm1
1065    vextracti128 [t3+r10*4+56], m1, 1
1066    add            r10, 16
1067    jl .v_loop
1068    ret
1069.prep_n: ; initial neighbor setup
1070    mov            r10, wq
1071.prep_n_loop:
1072    movu            m0, [t3+r10*4+ 4]
1073    movu            m1, [t3+r10*4+36]
1074    paddd           m2, m0, [t3+r10*4+ 0]
1075    paddd           m3, m1, [t3+r10*4+32]
1076    paddd           m2, [t3+r10*4+ 8]
1077    paddd           m3, [t3+r10*4+40]
1078    paddd           m0, m2
1079    pslld           m2, 2
1080    paddd           m1, m3
1081    pslld           m3, 2
1082    paddd           m2, m0                ; ab 565
1083    paddd           m3, m1
1084    pandn           m0, m15, m2           ; a
1085    psrld           m2, 12                ; b
1086    pandn           m1, m15, m3
1087    psrld           m3, 12
1088    mova [t3+r10*4+400*4+ 0], m0
1089    mova [t3+r10*4+400*8+ 0], m2
1090    mova [t3+r10*4+400*4+32], m1
1091    mova [t3+r10*4+400*8+32], m3
1092    add            r10, 16
1093    jl .prep_n_loop
1094    ret
1095ALIGN function_align
1096.n0: ; neighbor + output (even rows)
1097    mov            r10, wq
1098.n0_loop:
1099    movu            m0, [t3+r10*4+ 4]
1100    movu            m1, [t3+r10*4+36]
1101    paddd           m2, m0, [t3+r10*4+ 0]
1102    paddd           m3, m1, [t3+r10*4+32]
1103    paddd           m2, [t3+r10*4+ 8]
1104    paddd           m3, [t3+r10*4+40]
1105    paddd           m0, m2
1106    pslld           m2, 2
1107    paddd           m1, m3
1108    pslld           m3, 2
1109    paddd           m2, m0
1110    paddd           m3, m1
1111    pandn           m0, m15, m2
1112    psrld           m2, 12
1113    pandn           m1, m15, m3
1114    psrld           m3, 12
1115    paddd           m4, m0, [t3+r10*4+400*4+ 0] ; a
1116    paddd           m5, m1, [t3+r10*4+400*4+32]
1117    mova [t3+r10*4+400*4+ 0], m0
1118    mova [t3+r10*4+400*4+32], m1
1119    paddd           m0, m2, [t3+r10*4+400*8+ 0] ; b
1120    paddd           m1, m3, [t3+r10*4+400*8+32]
1121    mova [t3+r10*4+400*8+ 0], m2
1122    mova [t3+r10*4+400*8+32], m3
1123    pmovzxbd        m2, [dstq+r10+0]
1124    pmovzxbd        m3, [dstq+r10+8]
1125    pmaddwd         m4, m2 ; a * src
1126    pmaddwd         m5, m3
1127    packssdw        m2, m3
1128    psubd           m0, m4 ; b - a * src + (1 << 8)
1129    psubd           m1, m5
1130    psrad           m0, 9
1131    psrad           m1, 9
1132    packssdw        m0, m1
1133    pmulhrsw        m0, m7
1134    paddw           m0, m2
1135    vextracti128   xm1, m0, 1
1136    packuswb       xm0, xm1
1137    pshufd         xm0, xm0, q3120
1138    mova    [dstq+r10], xm0
1139    add            r10, 16
1140    jl .n0_loop
1141    add           dstq, strideq
1142    ret
1143ALIGN function_align
1144.n1: ; neighbor + output (odd rows)
1145    mov            r10, wq
1146.n1_loop:
1147    pmovzxbd        m2, [dstq+r10+0]
1148    pmovzxbd        m3, [dstq+r10+8]
1149    pmaddwd         m4, m2, [t3+r10*4+400*4+ 0] ; a * src
1150    pmaddwd         m5, m3, [t3+r10*4+400*4+32]
1151    mova            m0, [t3+r10*4+400*8+ 0]     ; b
1152    mova            m1, [t3+r10*4+400*8+32]
1153    packssdw        m2, m3
1154    psubd           m0, m4                      ; b - a * src + (1 << 7)
1155    psubd           m1, m5
1156    psrad           m0, 8
1157    psrad           m1, 8
1158    packssdw        m0, m1
1159    pmulhrsw        m0, m7
1160    paddw           m0, m2
1161    vextracti128   xm1, m0, 1
1162    packuswb       xm0, xm1
1163    pshufd         xm0, xm0, q3120
1164    mova    [dstq+r10], xm0
1165    add            r10, 16
1166    jl .n1_loop
1167    add           dstq, strideq
1168    ret
1169
1170cglobal sgr_filter_3x3_8bpc, 4, 15, 15, -400*28-16, dst, stride, left, lpf, \
1171                                                    w, h, edge, params
1172%define base r14-sgr_x_by_x_avx2-256*4
1173    mov        paramsq, r6mp
1174    mov             wd, wm
1175    movifnidn       hd, hm
1176    mov          edged, r7m
1177    lea            r14, [sgr_x_by_x_avx2+256*4]
1178    vbroadcasti128  m8, [base+sgr_shuf+2]
1179    add           lpfq, wq
1180    vbroadcasti128  m9, [base+sgr_shuf+4]
1181    add           dstq, wq
1182    vbroadcasti128 m10, [base+sgr_shuf+6]
1183    lea             t3, [rsp+wq*4+16+400*12]
1184    vpbroadcastd   m11, [paramsq+ 4] ; s1
1185    pxor            m6, m6
1186    vpbroadcastw    m7, [paramsq+10] ; w1
1187    lea             t1, [rsp+wq*2+20]
1188    vpbroadcastd   m12, [base+pd_0xf00801c7]
1189    neg             wq
1190    vpbroadcastd   m13, [base+pd_34816] ; (1 << 11) + (1 << 15)
1191    psllw           m7, 4
1192    vpbroadcastd   m14, [base+pd_m4096]
1193    test         edgeb, 4 ; LR_HAVE_TOP
1194    jz .no_top
1195    call .h_top
1196    add           lpfq, strideq
1197    mov             t2, t1
1198    add             t1, 400*6
1199    call .h_top
1200    lea             t4, [lpfq+strideq*4]
1201    mov           lpfq, dstq
1202    add             t4, strideq
1203    mov          [rsp], t4 ; below
1204    mov             t0, t2
1205    call .hv
1206.main:
1207    mov             t5, t3
1208    add             t3, 400*4
1209    dec             hd
1210    jz .height1
1211    add           lpfq, strideq
1212    call .hv
1213    call .prep_n
1214    dec             hd
1215    jz .extend_bottom
1216.main_loop:
1217    add           lpfq, strideq
1218    call .hv
1219    call .n
1220    dec             hd
1221    jnz .main_loop
1222    test         edgeb, 8 ; LR_HAVE_BOTTOM
1223    jz .extend_bottom
1224    mov           lpfq, [rsp]
1225    call .hv_bottom
1226    call .n
1227    add           lpfq, strideq
1228    call .hv_bottom
1229.end:
1230    call .n
1231    RET
1232.height1:
1233    call .v
1234    call .prep_n
1235    mov             t2, t1
1236    call .v
1237    jmp .end
1238.extend_bottom:
1239    call .v
1240    call .n
1241    mov             t2, t1
1242    call .v
1243    jmp .end
1244.no_top:
1245    lea             t4, [lpfq+strideq*4]
1246    mov           lpfq, dstq
1247    lea             t4, [t4+strideq*2]
1248    mov          [rsp], t4
1249    call .h
1250    lea             t0, [t1+400*6]
1251    mov             t2, t1
1252    call .v
1253    jmp .main
1254.h: ; horizontal boxsum
1255    lea            r10, [wq-2]
1256    test         edgeb, 1 ; LR_HAVE_LEFT
1257    jz .h_extend_left
1258    vpbroadcastd   xm0, [leftq]
1259    mova           xm5, [lpfq+wq]
1260    palignr        xm5, xm0, 12
1261    add          leftq, 4
1262    jmp .h_main
1263.h_extend_left:
1264    mova           xm5, [lpfq+wq]
1265    pshufb         xm5, [base+sgr_l_shuf]
1266    jmp .h_main
1267.h_top:
1268    lea            r10, [wq-2]
1269    test         edgeb, 1 ; LR_HAVE_LEFT
1270    jz .h_extend_left
1271.h_loop:
1272    movu           xm5, [lpfq+r10-2]
1273.h_main:
1274    vinserti128     m5, [lpfq+r10+6], 1
1275    test         edgeb, 2 ; LR_HAVE_RIGHT
1276    jnz .h_have_right
1277    cmp           r10d, -17
1278    jl .h_have_right
1279    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1280.h_have_right:
1281    pshufb          m0, m5, m8
1282    pmullw          m2, m0, m0
1283    pshufb          m4, m5, m9
1284    paddw           m0, m4
1285    pshufb          m5, m10
1286    paddw           m0, m5 ; sum
1287    punpcklwd       m3, m4, m5
1288    pmaddwd         m3, m3
1289    punpckhwd       m4, m5
1290    pmaddwd         m4, m4
1291    punpcklwd       m1, m2, m6
1292    punpckhwd       m2, m6
1293    mova [t1+r10*2+400*0], m0
1294    paddd           m1, m3 ; sumsq
1295    paddd           m2, m4
1296    mova [t1+r10*2+400*2], m1
1297    mova [t1+r10*2+400*4], m2
1298    add            r10, 16
1299    jl .h_loop
1300    ret
1301ALIGN function_align
1302.hv: ; horizontal boxsum + vertical boxsum + ab
1303    lea            r10, [wq-2]
1304    test         edgeb, 1 ; LR_HAVE_LEFT
1305    jz .hv_extend_left
1306    vpbroadcastd   xm0, [leftq]
1307    mova           xm5, [lpfq+wq]
1308    palignr        xm5, xm0, 12
1309    add          leftq, 4
1310    jmp .hv_main
1311.hv_extend_left:
1312    mova           xm5, [lpfq+wq]
1313    pshufb         xm5, [base+sgr_l_shuf]
1314    jmp .hv_main
1315.hv_bottom:
1316    lea            r10, [wq-2]
1317    test         edgeb, 1 ; LR_HAVE_LEFT
1318    jz .hv_extend_left
1319.hv_loop:
1320    movu           xm5, [lpfq+r10-2]
1321.hv_main:
1322    vinserti128     m5, [lpfq+r10+6], 1
1323    test         edgeb, 2 ; LR_HAVE_RIGHT
1324    jnz .hv_have_right
1325    cmp           r10d, -17
1326    jl .hv_have_right
1327    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1328.hv_have_right:
1329    pshufb          m0, m5, m8
1330    pmullw          m3, m0, m0
1331    pshufb          m1, m5, m9
1332    paddw           m0, m1
1333    pshufb          m5, m10
1334    paddw           m0, m5               ; h sum
1335    punpcklwd       m4, m5, m1
1336    pmaddwd         m4, m4
1337    punpckhwd       m5, m1
1338    pmaddwd         m5, m5
1339    paddw           m1, m0, [t2+r10*2+400*0]
1340    paddw           m1, [t1+r10*2+400*0] ; hv sum
1341    punpcklwd       m2, m3, m6
1342    punpckhwd       m3, m6
1343    paddd           m4, m2               ; h sumsq
1344    paddd           m5, m3
1345    paddd           m2, m4, [t2+r10*2+400*2]
1346    paddd           m3, m5, [t2+r10*2+400*4]
1347    paddd           m2, [t1+r10*2+400*2] ; hv sumsq
1348    paddd           m3, [t1+r10*2+400*4]
1349    mova [t0+r10*2+400*0], m0
1350    punpcklwd       m0, m1, m6           ; b
1351    punpckhwd       m1, m6
1352    mova [t0+r10*2+400*2], m4
1353    pslld           m4, m2, 3
1354    mova [t0+r10*2+400*4], m5
1355    pslld           m5, m3, 3
1356    paddd           m4, m2               ; a * 9
1357    pmaddwd         m2, m0, m0           ; b * b
1358    paddd           m5, m3
1359    pmaddwd         m3, m1, m1
1360    psubd           m4, m2               ; p
1361    psubd           m5, m3
1362    pmulld          m4, m11              ; p * s
1363    pmulld          m5, m11
1364    pmaddwd         m0, m12              ; b * 455
1365    pmaddwd         m1, m12
1366    paddusw         m4, m12
1367    paddusw         m5, m12
1368    psrad           m3, m4, 20           ; min(z, 255) - 256
1369    vpgatherdd      m2, [r14+m3*4], m4
1370    psrad           m4, m5, 20
1371    vpgatherdd      m3, [r14+m4*4], m5
1372    pmulld          m0, m2
1373    pmulld          m1, m3
1374    paddd           m0, m13              ; x * b * 455 + (1 << 11) + (1 << 15)
1375    paddd           m1, m13
1376    pand            m0, m14
1377    pand            m1, m14
1378    por             m0, m2               ; a | (b << 12)
1379    por             m1, m3
1380    mova         [t3+r10*4+ 8], xm0
1381    vextracti128 [t3+r10*4+40], m0, 1
1382    mova         [t3+r10*4+24], xm1
1383    vextracti128 [t3+r10*4+56], m1, 1
1384    add            r10, 16
1385    jl .hv_loop
1386    mov             t2, t1
1387    mov             t1, t0
1388    mov             t0, t2
1389    ret
1390.v: ; vertical boxsum + ab
1391    lea            r10, [wq-2]
1392.v_loop:
1393    mova            m1, [t1+r10*2+400*0]
1394    paddw           m1, m1
1395    paddw           m1, [t2+r10*2+400*0] ; hv sum
1396    mova            m2, [t1+r10*2+400*2]
1397    mova            m3, [t1+r10*2+400*4]
1398    paddd           m2, m2
1399    paddd           m3, m3
1400    paddd           m2, [t2+r10*2+400*2] ; hv sumsq
1401    paddd           m3, [t2+r10*2+400*4]
1402    punpcklwd       m0, m1, m6           ; b
1403    punpckhwd       m1, m6
1404    pslld           m4, m2, 3
1405    pslld           m5, m3, 3
1406    paddd           m4, m2               ; a * 9
1407    pmaddwd         m2, m0, m0           ; b * b
1408    paddd           m5, m3
1409    pmaddwd         m3, m1, m1
1410    psubd           m4, m2               ; p
1411    psubd           m5, m3
1412    pmulld          m4, m11              ; p * s
1413    pmulld          m5, m11
1414    pmaddwd         m0, m12              ; b * 455
1415    pmaddwd         m1, m12
1416    paddusw         m4, m12
1417    paddusw         m5, m12
1418    psrad           m3, m4, 20           ; min(z, 255) - 256
1419    vpgatherdd      m2, [r14+m3*4], m4
1420    psrad           m4, m5, 20
1421    vpgatherdd      m3, [r14+m4*4], m5
1422    pmulld          m0, m2
1423    pmulld          m1, m3
1424    paddd           m0, m13              ; x * b * 455 + (1 << 11) + (1 << 15)
1425    paddd           m1, m13
1426    pand            m0, m14
1427    pand            m1, m14
1428    por             m0, m2               ; a | (b << 12)
1429    por             m1, m3
1430    mova         [t3+r10*4+ 8], xm0
1431    vextracti128 [t3+r10*4+40], m0, 1
1432    mova         [t3+r10*4+24], xm1
1433    vextracti128 [t3+r10*4+56], m1, 1
1434    add            r10, 16
1435    jl .v_loop
1436    ret
1437.prep_n: ; initial neighbor setup
1438    mov            r10, wq
1439    mov             t4, t3
1440    add             t3, 400*4
1441.prep_n_loop:
1442    mova            m2, [t5+r10*4+0]
1443    mova            m3, [t4+r10*4+0]
1444    paddd           m2, [t5+r10*4+8]
1445    paddd           m3, [t4+r10*4+8]
1446    paddd           m0, m2, [t5+r10*4+4]
1447    paddd           m1, m3, [t4+r10*4+4]
1448    pslld           m0, 2
1449    paddd           m1, m1                ; ab[ 0] 222
1450    psubd           m0, m2                ; ab[-1] 343
1451    mova [t3+r10*4+400*4], m1
1452    paddd           m1, m1
1453    mova    [t5+r10*4], m0
1454    psubd           m1, m3                ; ab[ 0] 343
1455    mova    [t4+r10*4], m1
1456    add            r10, 8
1457    jl .prep_n_loop
1458    ret
1459; a+b are packed together in a single dword, but we can't do the
1460; full neighbor calculations before splitting them since we don't
1461; have sufficient precision. The solution is to do the calculations
1462; in two equal halves and split a and b before doing the final sum.
1463ALIGN function_align
1464.n: ; neighbor + output
1465    mov            r10, wq
1466.n_loop:
1467    mova            m4, [t3+r10*4+ 0]
1468    paddd           m4, [t3+r10*4+ 8]
1469    paddd           m5, m4, [t3+r10*4+ 4]
1470    paddd           m5, m5                ; ab[+1] 222
1471    mova            m2, [t3+r10*4+400*4+ 0]
1472    paddd           m0, m2, [t5+r10*4+ 0] ; ab[ 0] 222 + ab[-1] 343
1473    mova            m3, [t3+r10*4+400*4+32]
1474    paddd           m1, m3, [t5+r10*4+32]
1475    mova [t3+r10*4+400*4+ 0], m5
1476    paddd           m5, m5
1477    psubd           m5, m4                ; ab[+1] 343
1478    mova [t5+r10*4+ 0], m5
1479    paddd           m2, m5                ; ab[ 0] 222 + ab[+1] 343
1480    mova            m4, [t3+r10*4+32]
1481    paddd           m4, [t3+r10*4+40]
1482    paddd           m5, m4, [t3+r10*4+36]
1483    paddd           m5, m5
1484    mova [t3+r10*4+400*4+32], m5
1485    paddd           m5, m5
1486    psubd           m5, m4
1487    mova [t5+r10*4+32], m5
1488    pandn           m4, m14, m0
1489    psrld           m0, 12
1490    paddd           m3, m5
1491    pandn           m5, m14, m2
1492    psrld           m2, 12
1493    paddd           m4, m5                ; a
1494    pandn           m5, m14, m1
1495    psrld           m1, 12
1496    paddd           m0, m2                ; b + (1 << 8)
1497    pandn           m2, m14, m3
1498    psrld           m3, 12
1499    paddd           m5, m2
1500    pmovzxbd        m2, [dstq+r10+0]
1501    paddd           m1, m3
1502    pmovzxbd        m3, [dstq+r10+8]
1503    pmaddwd         m4, m2                ; a * src
1504    pmaddwd         m5, m3
1505    packssdw        m2, m3
1506    psubd           m0, m4                ; b - a * src + (1 << 8)
1507    psubd           m1, m5
1508    psrad           m0, 9
1509    psrad           m1, 9
1510    packssdw        m0, m1
1511    pmulhrsw        m0, m7
1512    paddw           m0, m2
1513    vextracti128   xm1, m0, 1
1514    packuswb       xm0, xm1
1515    pshufd         xm0, xm0, q3120
1516    mova    [dstq+r10], xm0
1517    add            r10, 16
1518    jl .n_loop
1519    mov            r10, t5
1520    mov             t5, t4
1521    mov             t4, r10
1522    add           dstq, strideq
1523    ret
1524
1525cglobal sgr_filter_mix_8bpc, 4, 13, 16, 400*56+8, dst, stride, left, lpf, \
1526                                                  w, h, edge, params
1527%define base r12-sgr_x_by_x_avx2-256*4
1528    lea            r12, [sgr_x_by_x_avx2+256*4]
1529    mov        paramsq, r6mp
1530    mov             wd, wm
1531    movifnidn       hd, hm
1532    mov          edged, r7m
1533    vbroadcasti128  m9, [base+sgr_shuf+0]
1534    vbroadcasti128 m10, [base+sgr_shuf+8]
1535    add           lpfq, wq
1536    vbroadcasti128 m11, [base+sgr_shuf+2]
1537    vbroadcasti128 m12, [base+sgr_shuf+6]
1538    add           dstq, wq
1539    vpbroadcastd   m15, [paramsq+8] ; w0 w1
1540    lea             t3, [rsp+wq*4+400*24+8]
1541    vpbroadcastd   m13, [paramsq+0] ; s0
1542    pxor            m7, m7
1543    vpbroadcastd   m14, [paramsq+4] ; s1
1544    lea             t1, [rsp+wq*2+12]
1545    neg             wq
1546    psllw          m15, 2 ; to reuse existing pd_m4096 register for rounding
1547    test         edgeb, 4 ; LR_HAVE_TOP
1548    jz .no_top
1549    call .h_top
1550    add           lpfq, strideq
1551    mov             t2, t1
1552    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).top_fixup
1553    add             t1, 400*12
1554    call .h_top
1555    lea            r10, [lpfq+strideq*4]
1556    mov           lpfq, dstq
1557    add            r10, strideq
1558    mov          [rsp], r10 ; below
1559    call .hv0
1560.main:
1561    dec             hd
1562    jz .height1
1563    add           lpfq, strideq
1564    call .hv1
1565    call .prep_n
1566    sub             hd, 2
1567    jl .extend_bottom
1568.main_loop:
1569    add           lpfq, strideq
1570    call .hv0
1571    test            hd, hd
1572    jz .odd_height
1573    add           lpfq, strideq
1574    call .hv1
1575    call .n0
1576    call .n1
1577    sub             hd, 2
1578    jge .main_loop
1579    test         edgeb, 8 ; LR_HAVE_BOTTOM
1580    jz .extend_bottom
1581    mov           lpfq, [rsp]
1582    call .hv0_bottom
1583    add           lpfq, strideq
1584    call .hv1_bottom
1585.end:
1586    call .n0
1587    call .n1
1588.end2:
1589    RET
1590.height1:
1591    call .v1
1592    call .prep_n
1593    jmp .odd_height_end
1594.odd_height:
1595    call .v1
1596    call .n0
1597    call .n1
1598.odd_height_end:
1599    call .v0
1600    call .v1
1601    call .n0
1602    jmp .end2
1603.extend_bottom:
1604    call .v0
1605    call .v1
1606    jmp .end
1607.no_top:
1608    lea            r10, [lpfq+strideq*4]
1609    mov           lpfq, dstq
1610    lea            r10, [r10+strideq*2]
1611    mov          [rsp], r10
1612    call .h
1613    lea             t2, [t1+400*12]
1614    lea            r10, [wq-2]
1615.top_fixup_loop:
1616    mova            m0, [t1+r10*2+400* 0]
1617    mova            m1, [t1+r10*2+400* 2]
1618    mova            m2, [t1+r10*2+400* 4]
1619    paddw           m0, m0
1620    mova            m3, [t1+r10*2+400* 6]
1621    paddd           m1, m1
1622    mova            m4, [t1+r10*2+400* 8]
1623    paddd           m2, m2
1624    mova            m5, [t1+r10*2+400*10]
1625    mova [t2+r10*2+400* 0], m0
1626    mova [t2+r10*2+400* 2], m1
1627    mova [t2+r10*2+400* 4], m2
1628    mova [t2+r10*2+400* 6], m3
1629    mova [t2+r10*2+400* 8], m4
1630    mova [t2+r10*2+400*10], m5
1631    add            r10, 16
1632    jl .top_fixup_loop
1633    call .v0
1634    jmp .main
1635.h: ; horizontal boxsums
1636    lea            r10, [wq-2]
1637    test         edgeb, 1 ; LR_HAVE_LEFT
1638    jz .h_extend_left
1639    vpbroadcastd   xm0, [leftq]
1640    mova           xm5, [lpfq+wq]
1641    palignr        xm5, xm0, 12
1642    add          leftq, 4
1643    jmp .h_main
1644.h_extend_left:
1645    mova           xm5, [lpfq+wq]
1646    pshufb         xm5, [base+sgr_l_shuf]
1647    jmp .h_main
1648.h_top:
1649    lea            r10, [wq-2]
1650    test         edgeb, 1 ; LR_HAVE_LEFT
1651    jz .h_extend_left
1652.h_loop:
1653    movu           xm5, [lpfq+r10-2]
1654.h_main:
1655    vinserti128     m5, [lpfq+r10+6], 1
1656    test         edgeb, 2 ; LR_HAVE_RIGHT
1657    jnz .h_have_right
1658    cmp           r10d, -18
1659    jl .h_have_right
1660    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
1661.h_have_right:
1662    pshufb          m6, m5, m9
1663    pshufb          m4, m5, m10
1664    paddw           m8, m6, m4
1665    shufps          m0, m6, m4, q2121
1666    pmullw          m3, m0, m0
1667    pshufb          m2, m5, m11
1668    paddw           m0, m2
1669    pshufb          m5, m12
1670    paddw           m0, m5 ; sum3
1671    punpcklwd       m1, m2, m5
1672    pmaddwd         m1, m1
1673    punpckhwd       m2, m5
1674    pmaddwd         m2, m2
1675    punpcklwd       m5, m6, m4
1676    pmaddwd         m5, m5
1677    punpckhwd       m6, m4
1678    pmaddwd         m6, m6
1679    punpcklwd       m4, m3, m7
1680    paddd           m1, m4 ; sumsq3
1681    punpckhwd       m3, m7
1682    paddd           m2, m3
1683    mova [t1+r10*2+400* 6], m0
1684    mova [t1+r10*2+400* 8], m1
1685    mova [t1+r10*2+400*10], m2
1686    paddw           m8, m0 ; sum5
1687    paddd           m5, m1 ; sumsq5
1688    paddd           m6, m2
1689    mova [t1+r10*2+400* 0], m8
1690    mova [t1+r10*2+400* 2], m5
1691    mova [t1+r10*2+400* 4], m6
1692    add            r10, 16
1693    jl .h_loop
1694    ret
1695ALIGN function_align
1696.hv0: ; horizontal boxsums + vertical boxsum3 + ab3 (even rows)
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
    vpbroadcastd   xm0, [leftq]
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .hv0_main
.hv0_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [base+sgr_l_shuf]
    jmp .hv0_main
.hv0_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv0_extend_left
.hv0_loop:
    movu           xm5, [lpfq+r10-2]
.hv0_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv0_have_right
    cmp           r10d, -18
    jl .hv0_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv0_have_right:
    pshufb          m6, m5, m9
    pshufb          m4, m5, m10
    paddw           m8, m6, m4
    shufps          m1, m6, m4, q2121
    pmullw          m0, m1, m1
    pshufb          m3, m5, m11
    paddw           m1, m3
    pshufb          m5, m12
    paddw           m1, m5 ; sum3
    punpcklwd       m2, m3, m5
    pmaddwd         m2, m2
    punpckhwd       m3, m5
    pmaddwd         m3, m3
    punpcklwd       m5, m6, m4
    pmaddwd         m5, m5
    punpckhwd       m6, m4
    pmaddwd         m6, m6
    punpcklwd       m4, m0, m7
    paddd           m2, m4 ; sumsq3
    punpckhwd       m0, m7
    paddd           m3, m0
    paddw           m8, m1 ; sum5
    paddd           m5, m2 ; sumsq5
    paddd           m6, m3
    mova [t3+r10*4+400*8+ 8], m8 ; we need a clean copy of the last row
    mova [t3+r10*4+400*0+ 8], m5 ; in case height is odd
    mova [t3+r10*4+400*0+40], m6
    paddw           m8, [t1+r10*2+400* 0]
    paddd           m5, [t1+r10*2+400* 2]
    paddd           m6, [t1+r10*2+400* 4]
    mova [t1+r10*2+400* 0], m8
    mova [t1+r10*2+400* 2], m5
    mova [t1+r10*2+400* 4], m6
    paddw           m0, m1, [t1+r10*2+400* 6]
    paddd           m4, m2, [t1+r10*2+400* 8]
    paddd           m5, m3, [t1+r10*2+400*10]
    mova [t1+r10*2+400* 6], m1
    mova [t1+r10*2+400* 8], m2
    mova [t1+r10*2+400*10], m3
    paddw           m1, m0, [t2+r10*2+400* 6]
    paddd           m2, m4, [t2+r10*2+400* 8]
    paddd           m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m5, m3
    pmulld          m4, m14              ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m4, m2
    paddusw         m5, m2
    psrad           m3, m4, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m4
    psrad           m4, m5, 20
    vpgatherdd      m3, [r12+m4*4], m5
    vpbroadcastd    m4, [base+pd_34816]
    pmulld          m0, m2
    vpbroadcastd    m5, [base+pd_m4096]
    pmulld          m1, m3
    paddd           m0, m4               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m4
    pand            m0, m5
    pand            m1, m5
    por             m0, m2               ; a3 | (b3 << 12)
    por             m1, m3
    mova         [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova         [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add            r10, 16
    jl .hv0_loop
    ret
ALIGN function_align
.hv1: ; horizontal boxsums + vertical boxsums + ab (odd rows)
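; Odd rows complete a row pair: besides a3/b3 for this row (same math as in
; .hv0), the accumulated 5-tap sums are turned into a5/b5. Roughly, again
; following the inline comments:
;   p5 = sumsq5*25 - sum5^2
;   z5 = min((p5*s0 + (1 << 19)) >> 20, 255)
;   b5 = x5*sum5*164 + (1 << 11) + (1 << 15)   (164 ~= 2^12/25)
; t1 and t2 are swapped on return so the next even row starts a new pair.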
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
    vpbroadcastd   xm0, [leftq]
    mova           xm5, [lpfq+wq]
    palignr        xm5, xm0, 12
    add          leftq, 4
    jmp .hv1_main
.hv1_extend_left:
    mova           xm5, [lpfq+wq]
    pshufb         xm5, [base+sgr_l_shuf]
    jmp .hv1_main
.hv1_bottom:
    lea            r10, [wq-2]
    test         edgeb, 1 ; LR_HAVE_LEFT
    jz .hv1_extend_left
.hv1_loop:
    movu           xm5, [lpfq+r10-2]
.hv1_main:
    vinserti128     m5, [lpfq+r10+6], 1
    test         edgeb, 2 ; LR_HAVE_RIGHT
    jnz .hv1_have_right
    cmp           r10d, -18
    jl .hv1_have_right
    call mangle(private_prefix %+ _sgr_filter_5x5_8bpc_avx2).extend_right
.hv1_have_right:
    pshufb          m6, m5, m9
    pshufb          m3, m5, m10
    paddw           m8, m6, m3
    shufps          m2, m6, m3, q2121
    pmullw          m1, m2, m2
    pshufb          m0, m5, m11
    paddw           m2, m0
    pshufb          m5, m12
    paddw           m2, m5 ; sum3
    punpcklwd       m4, m5, m0
    pmaddwd         m4, m4
    punpckhwd       m5, m0
    pmaddwd         m5, m5
    punpcklwd       m0, m6, m3
    pmaddwd         m0, m0
    punpckhwd       m6, m3
    pmaddwd         m6, m6
    punpcklwd       m3, m1, m7
    paddd           m4, m3 ; sumsq3
    punpckhwd       m1, m7
    paddd           m5, m1
    paddw           m1, m2, [t2+r10*2+400* 6]
    mova [t2+r10*2+400* 6], m2
    paddw           m8, m2 ; sum5
    paddd           m2, m4, [t2+r10*2+400* 8]
    paddd           m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    paddd           m4, m0 ; sumsq5
    paddd           m5, m6
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m6, m2, 3
    pslld           m7, m3, 3
    paddd           m6, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m7, m3
    pmaddwd         m3, m1, m1
    psubd           m6, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m7, m3
    pmulld          m6, m14              ; p3 * s1
    pmulld          m7, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m6, m2
    paddusw         m7, m2
    psrad           m3, m6, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m6   ; x3
    psrad           m6, m7, 20
    vpgatherdd      m3, [r12+m6*4], m7
    vpbroadcastd    m6, [base+pd_34816]
    pmulld          m0, m2
    vpbroadcastd    m7, [base+pd_m4096]
    pmulld          m1, m3
    paddd           m0, m6               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    pand            m0, m7
    pand            m7, m1
    por             m0, m2               ; a3 | (b3 << 12)
    por             m7, m3
    paddw           m1, m8, [t2+r10*2+400*0]
    paddd           m2, m4, [t2+r10*2+400*2]
    paddd           m3, m5, [t2+r10*2+400*4]
    paddw           m1, [t1+r10*2+400*0]
    paddd           m2, [t1+r10*2+400*2]
    paddd           m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m8
    mova [t2+r10*2+400*2], m4
    mova [t2+r10*2+400*4], m5
    mova         [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova         [t3+r10*4+400*8+24], xm7
    vextracti128 [t3+r10*4+400*8+56], m7, 1
    vpbroadcastd    m4, [base+pd_25]
    pxor            m7, m7
    punpcklwd       m0, m1, m7           ; b5
    punpckhwd       m1, m7
    pmulld          m2, m4               ; a5 * 25
    pmulld          m3, m4
    pmaddwd         m4, m0, m0           ; b5 * b5
    pmaddwd         m5, m1, m1
    psubd           m2, m4               ; p5
    vpbroadcastd    m4, [base+pd_0xf00800a4]
    psubd           m3, m5
    pmulld          m2, m13              ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4               ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20           ; min(z5, 255) - 256
    vpgatherdd      m4, [r12+m5*4], m2   ; x5
    psrad           m2, m3, 20
    vpgatherdd      m5, [r12+m2*4], m3
    pmulld          m0, m4
    pmulld          m1, m5
    paddd           m0, m6               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    vpbroadcastd    m6, [base+pd_m4096]
    pand            m0, m6
    pand            m1, m6
    por             m0, m4               ; a5 | (b5 << 12)
    por             m1, m5
    mova         [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova         [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add            r10, 16
    jl .hv1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.v0: ; vertical boxsums + ab3 (even rows)
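; Vertical-only variant of .hv0, used when no new input row is read (e.g.
; when extending past the bottom): the previous row's horizontal sums in t1
; are simply doubled to stand in for the missing row before the usual a3/b3
; computation.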
    lea            r10, [wq-2]
    vpbroadcastd    m6, [base+pd_34816]
    vpbroadcastd    m8, [base+pd_m4096]
.v0_loop:
    mova            m0, [t1+r10*2+400* 6]
    mova            m4, [t1+r10*2+400* 8]
    mova            m5, [t1+r10*2+400*10]
    paddw           m0, m0
    paddd           m4, m4
    paddd           m5, m5
    paddw           m1, m0, [t2+r10*2+400* 6]
    paddd           m2, m4, [t2+r10*2+400* 8]
    paddd           m3, m5, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m0
    mova [t2+r10*2+400* 8], m4
    mova [t2+r10*2+400*10], m5
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m5, m3
    pmulld          m4, m14              ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m4, m2
    paddusw         m5, m2
    psrad           m3, m4, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m4   ; x3
    psrad           m4, m5, 20
    vpgatherdd      m3, [r12+m4*4], m5
    pmulld          m0, m2
    pmulld          m1, m3
    paddd           m0, m6               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    pand            m0, m8
    pand            m1, m8
    por             m0, m2               ; a3 | (b3 << 12)
    por             m1, m3
    mova            m2, [t1+r10*2+400*0]
    mova            m3, [t1+r10*2+400*2]
    mova            m4, [t1+r10*2+400*4]
    mova [t3+r10*4+400*8+ 8], m2
    mova [t3+r10*4+400*0+ 8], m3
    mova [t3+r10*4+400*0+40], m4
    paddw           m2, m2               ; cc5
    paddd           m3, m3
    paddd           m4, m4
    mova [t1+r10*2+400*0], m2
    mova [t1+r10*2+400*2], m3
    mova [t1+r10*2+400*4], m4
    mova         [t3+r10*4+400*4+ 8], xm0
    vextracti128 [t3+r10*4+400*4+40], m0, 1
    mova         [t3+r10*4+400*4+24], xm1
    vextracti128 [t3+r10*4+400*4+56], m1, 1
    add            r10, 16
    jl .v0_loop
    ret
.v1: ; vertical boxsums + ab (odd rows)
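; Odd-row counterpart of .v0: reuses the clean single-row copies that
; .hv0/.v0 stashed at t3+400*8+8 and t3+400*0+8/+40 as the "new" row, and
; produces both the a3/b3 and the a5/b5 pairs, like .hv1 does.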
    lea            r10, [wq-2]
.v1_loop:
    mova            m4, [t1+r10*2+400* 6]
    mova            m5, [t1+r10*2+400* 8]
    mova            m6, [t1+r10*2+400*10]
    paddw           m1, m4, [t2+r10*2+400* 6]
    paddd           m2, m5, [t2+r10*2+400* 8]
    paddd           m3, m6, [t2+r10*2+400*10]
    mova [t2+r10*2+400* 6], m4
    mova [t2+r10*2+400* 8], m5
    mova [t2+r10*2+400*10], m6
    punpcklwd       m0, m1, m7           ; b3
    punpckhwd       m1, m7
    pslld           m4, m2, 3
    pslld           m5, m3, 3
    paddd           m4, m2               ; a3 * 9
    pmaddwd         m2, m0, m0           ; b3 * b3
    paddd           m5, m3
    pmaddwd         m3, m1, m1
    psubd           m4, m2               ; p3
    vpbroadcastd    m2, [base+pd_0xf00801c7]
    psubd           m5, m3
    pmulld          m4, m14              ; p3 * s1
    pmulld          m5, m14
    pmaddwd         m0, m2               ; b3 * 455
    pmaddwd         m1, m2
    paddusw         m4, m2
    paddusw         m5, m2
    psrad           m3, m4, 20           ; min(z3, 255) - 256
    vpgatherdd      m2, [r12+m3*4], m4   ; x3
    psrad           m4, m5, 20
    vpgatherdd      m3, [r12+m4*4], m5
    vpbroadcastd    m4, [base+pd_34816]
    pmulld          m0, m2
    vpbroadcastd    m8, [base+pd_m4096]
    pmulld          m1, m3
    paddd           m0, m4               ; x3 * b3 * 455 + (1 << 11) + (1 << 15)
    paddd           m1, m4
    pand            m0, m8
    pand            m8, m1
    por             m0, m2               ; a3 | (b3 << 12)
    por             m8, m3
    mova            m4, [t3+r10*4+400*8+ 8]
    mova            m5, [t3+r10*4+400*0+ 8]
    mova            m6, [t3+r10*4+400*0+40]
    paddw           m1, m4, [t2+r10*2+400*0]
    paddd           m2, m5, [t2+r10*2+400*2]
    paddd           m3, m6, [t2+r10*2+400*4]
    paddw           m1, [t1+r10*2+400*0]
    paddd           m2, [t1+r10*2+400*2]
    paddd           m3, [t1+r10*2+400*4]
    mova [t2+r10*2+400*0], m4
    mova [t2+r10*2+400*2], m5
    mova [t2+r10*2+400*4], m6
    vpbroadcastd    m4, [base+pd_25]
    mova         [t3+r10*4+400*8+ 8], xm0
    vextracti128 [t3+r10*4+400*8+40], m0, 1
    mova         [t3+r10*4+400*8+24], xm8
    vextracti128 [t3+r10*4+400*8+56], m8, 1
    punpcklwd       m0, m1, m7           ; b5
    punpckhwd       m1, m7
    pmulld          m2, m4               ; a5 * 25
    pmulld          m3, m4
    pmaddwd         m4, m0, m0           ; b5 * b5
    pmaddwd         m5, m1, m1
    psubd           m2, m4               ; p5
    vpbroadcastd    m4, [base+pd_0xf00800a4]
    psubd           m3, m5
    pmulld          m2, m13              ; p5 * s0
    pmulld          m3, m13
    pmaddwd         m0, m4               ; b5 * 164
    pmaddwd         m1, m4
    paddusw         m2, m4
    paddusw         m3, m4
    psrad           m5, m2, 20           ; min(z5, 255) - 256
    vpgatherdd      m4, [r12+m5*4], m2   ; x5
    psrad           m2, m3, 20
    vpgatherdd      m5, [r12+m2*4], m3
    pmulld          m0, m4
    vpbroadcastd    m6, [base+pd_34816]
    pmulld          m1, m5
    paddd           m0, m6               ; x5 * b5 * 164 + (1 << 11) + (1 << 15)
    paddd           m1, m6
    vpbroadcastd    m6, [base+pd_m4096]
    pand            m0, m6
    pand            m1, m6
    por             m0, m4               ; a5 | (b5 << 12)
    por             m1, m5
    mova         [t3+r10*4+400*0+ 8], xm0
    vextracti128 [t3+r10*4+400*0+40], m0, 1
    mova         [t3+r10*4+400*0+24], xm1
    vextracti128 [t3+r10*4+400*0+56], m1, 1
    add            r10, 16
    jl .v1_loop
    mov            r10, t2
    mov             t2, t1
    mov             t1, r10
    ret
.prep_n: ; initial neighbor setup
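; The numbers in the comments below ("565", "343", "222") appear to be
; horizontal neighbor weights applied to the packed a|b dwords, e.g. for
; the 5-tap pass:
;   ab5_565[x] = 5*ab5[x-1] + 6*ab5[x] + 5*ab5[x+1]
; built as 4*(l+c+r) + (l+c+r) + c. The 3-tap pass alternates 3,4,3 and
; 2,2,2 row weightings; .n0/.n1 later add the 343 rows above and below to
; the current 222 row to form the full neighborhood. pandn with pd_m4096
; splits a packed value back into a (low 12 bits) and b (psrld 12).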
    mov            r10, wq
.prep_n_loop:
    movu            m0, [t3+r10*4+400*0+4]
    paddd           m1, m0, [t3+r10*4+400*0+0]
    mova            m4, [t3+r10*4+400*4+0]
    paddd           m1, [t3+r10*4+400*0+8]
    mova            m5, [t3+r10*4+400*8+0]
    paddd           m4, [t3+r10*4+400*4+8]
    paddd           m5, [t3+r10*4+400*8+8]
    paddd           m2, m4, [t3+r10*4+400*4+4]
    paddd           m3, m5, [t3+r10*4+400*8+4]
    paddd           m0, m1
    pslld           m1, 2
    pslld           m2, 2
    paddd           m1, m0                ; ab5 565
    paddd           m3, m3                ; ab3[ 0] 222
    psubd           m2, m4                ; ab3[-1] 343
    mova [t3+r10*4+400*20], m3
    pandn           m0, m6, m1            ; a5 565
    mova [t3+r10*4+400*24], m2
    psrld           m1, 12                ; b5 565
    mova [t3+r10*4+400*12], m0
    paddd           m3, m3
    mova [t3+r10*4+400*16], m1
    psubd           m3, m5                ; ab3[ 0] 343
    mova [t3+r10*4+400*28], m3
    add            r10, 8
    jl .prep_n_loop
    ret
ALIGN function_align
.n0: ; neighbor + output (even rows)
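; Even output rows: rebuild the 565-weighted a5/b5 for the newly finished
; row pair, sum it with the previous pair's values kept at t3+400*12/16,
; and combine the 222/343 rows of the 3-tap pass. The final step appears to
; be (m15 presumably holds the two sgrproj weights, m6 pd_m4096):
;   v   = w0*(b5 - a5*src) + w1*(b3 - a3*src)   (pblendw + pmaddwd m15)
;   dst = src + ((v + (1 << 12)) >> 13)         (psubd m6 adds the rounding)
; followed by packing back down to 8-bit pixels.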
    mov            r10, wq
.n0_loop:
    movu            m0, [t3+r10*4+4]
    paddd           m4, m0, [t3+r10*4+0]
    paddd           m4, [t3+r10*4+8]
    paddd           m0, m4
    pslld           m4, 2
    paddd           m4, m0
    pandn           m0, m6, m4
    psrld           m4, 12
    paddd           m2, m0, [t3+r10*4+400*12] ; a5
    mova [t3+r10*4+400*12], m0
    paddd           m0, m4, [t3+r10*4+400*16] ; b5 + (1 << 8)
    mova [t3+r10*4+400*16], m4
    mova            m3, [t3+r10*4+400*4+0]
    paddd           m3, [t3+r10*4+400*4+8]
    paddd           m5, m3, [t3+r10*4+400*4+4]
    paddd           m5, m5                    ; ab3[ 1] 222
    mova            m4, [t3+r10*4+400*20]
    paddd           m1, m4, [t3+r10*4+400*24] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd           m5, m5
    psubd           m5, m3                    ; ab3[ 1] 343
    mova [t3+r10*4+400*24], m5
    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
    pandn           m3, m6, m1
    psrld           m1, 12
    pandn           m5, m6, m4
    psrld           m4, 12
    paddd           m3, m5                    ; a3
    paddd           m1, m4                    ; b3 + (1 << 8)
    pmovzxbd        m4, [dstq+r10]
    pmaddwd         m2, m4                    ; a5 * src
    pmaddwd         m3, m4                    ; a3 * src
    psubd           m0, m2                    ; b5 - a5 * src + (1 << 8)
    psubd           m1, m3                    ; b3 - a3 * src + (1 << 8)
    psrld           m0, 9
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    psubd           m0, m6
    psrad           m0, 13
    paddd           m0, m4
    vextracti128   xm1, m0, 1
    packssdw       xm0, xm1
    packuswb       xm0, xm0
    movq    [dstq+r10], xm0
    add            r10, 8
    jl .n0_loop
    add           dstq, strideq
    ret
ALIGN function_align
.n1: ; neighbor + output (odd rows)
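; Odd output rows: the a5/b5 of the surrounding even rows is reused from
; t3+400*12/16 (hence the single pmaddwd against memory and the smaller
; shift), and only the 3-tap neighborhood is recomputed; the weighting and
; rounding at the end mirror .n0.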
    mov            r10, wq
.n1_loop:
    mova            m3, [t3+r10*4+400*8+0]
    paddd           m3, [t3+r10*4+400*8+8]
    paddd           m5, m3, [t3+r10*4+400*8+4]
    paddd           m5, m5                    ; ab3[ 1] 222
    mova            m4, [t3+r10*4+400*20]
    paddd           m1, m4, [t3+r10*4+400*28] ; ab3[ 0] 222 + ab3[-1] 343
    mova [t3+r10*4+400*20], m5
    paddd           m5, m5
    psubd           m5, m3                    ; ab3[ 1] 343
    mova [t3+r10*4+400*28], m5
    paddd           m4, m5                    ; ab3[ 0] 222 + ab3[ 1] 343
    pandn           m3, m6, m1
    psrld           m1, 12
    pandn           m5, m6, m4
    psrld           m4, 12
    paddd           m3, m5                    ; -a3
    paddd           m1, m4                    ;  b3 + (1 << 8)
    pmovzxbd        m4, [dstq+r10]
    pmaddwd         m2, m4, [t3+r10*4+400*12] ; -a5 * src
    mova            m0, [t3+r10*4+400*16]     ;  b5 + (1 << 7)
    pmaddwd         m3, m4                    ; -a3 * src
    psubd           m0, m2                    ; a5 * src + b5 + (1 << 7)
    psubd           m1, m3                    ; a3 * src + b3 + (1 << 8)
    psrld           m0, 8
    pslld           m1, 7
    pblendw         m0, m1, 0xaa
    pmaddwd         m0, m15
    psubd           m0, m6
    psrad           m0, 13
    paddd           m0, m4
    vextracti128   xm1, m0, 1
    packssdw       xm0, xm1
    packuswb       xm0, xm0
    movq    [dstq+r10], xm0
    add            r10, 8
    jl .n1_loop
    add           dstq, strideq
    ret

%endif ; ARCH_X86_64