; (HTML code-viewer navigation artifacts removed during extraction cleanup)
; Copyright © 2021, VideoLAN and dav1d authors
; Copyright © 2021, Two Orioles, LLC
; All rights reserved.
;
; Redistribution and use in source and binary forms, with or without
; modification, are permitted provided that the following conditions are met:
;
; 1. Redistributions of source code must retain the above copyright notice, this
;    list of conditions and the following disclaimer.
;
; 2. Redistributions in binary form must reproduce the above copyright notice,
;    this list of conditions and the following disclaimer in the documentation
;    and/or other materials provided with the distribution.
;
; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25
; Build configuration and the x86inc abstraction layer (cglobal, INIT_YMM,
; register name mangling, WIN64_SPILL_XMM, REPX, ...).
%include "config.asm"
%include "ext/x86/x86inc.asm"

SECTION_RODATA 64

; Emits the smooth-prediction weight tables from one variadic argument list.
; For each weight w it stores:
;   smooth_weights_1d_16bpc: w << 7            (one word per weight)
;   smooth_weights_2d_16bpc: w, 256 - w        (weight pair per weight)
%macro SMOOTH_WEIGHTS 1-*
const smooth_weights_1d_16bpc ; sm_weights[] << 7
    %rep %0
        dw %1*128
        %rotate 1
    %endrep
const smooth_weights_2d_16bpc ; sm_weights[], 256 - sm_weights[]
    %rep %0
        dw %1, 256-%1
        %rotate 1
    %endrep
%endmacro
43
; Weight values, concatenated per block size (the two leading zeros pad the
; table so indexing by size works; see smooth_weights_* lookups below).
SMOOTH_WEIGHTS   0,   0, 255, 128, 255, 149,  85,  64, \
               255, 197, 146, 105,  73,  50,  37,  32, \
               255, 225, 196, 170, 145, 123, 102,  84, \
                68,  54,  43,  33,  26,  20,  17,  16, \
               255, 240, 225, 210, 196, 182, 169, 157, \
               145, 133, 122, 111, 101,  92,  83,  74, \
                66,  59,  52,  45,  39,  34,  29,  25, \
                21,  17,  14,  12,  10,   9,   8,   8, \
               255, 248, 240, 233, 225, 218, 210, 203, \
               196, 189, 182, 176, 169, 163, 156, 150, \
               144, 138, 133, 127, 121, 116, 111, 106, \
               101,  96,  91,  86,  82,  77,  73,  69, \
                65,  61,  57,  54,  50,  47,  44,  41, \
                38,  35,  32,  29,  27,  25,  22,  20, \
                18,  16,  15,  13,  12,  10,   9,   8, \
                 7,   6,   6,   5,   5,   4,   4,   4
60
%if ARCH_X86_64

; Shuffle masks and constant tables used by the prediction kernels below.
ipred_hv_shuf: db  6,  7,  6,  7,  0,  1,  2,  3,  2,  3,  2,  3,  8,  9, 10, 11
               db  4,  5,  4,  5,  4,  5,  6,  7,  0,  1,  0,  1, 12, 13, 14, 15
filter_shuf1:  db  8,  9,  0,  1,  2,  3,  4,  5,  6,  7, 14, 15, 12, 13, -1, -1
filter_shuf2:  db  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,  4,  5,  2,  3, -1, -1
filter_shuf3:  db 12, 13,  0,  1,  2,  3,  4,  5,  6,  7, 10, 11,  8,  9, -1, -1
pal_pred_shuf: db  0,  2,  4,  6,  8, 10, 12, 14,  1,  3,  5,  7,  9, 11, 13, 15
z_base_inc:    dw   0*64,   1*64,   2*64,   3*64,   4*64,   5*64,   6*64,   7*64
               dw   8*64,   9*64,  10*64,  11*64,  12*64,  13*64,  14*64,  15*64
z_filter_t0:   db 55,127, 39,127, 39,127,  7, 15, 31,  7, 15, 31,  0,  3, 31,  0
z_filter_t1:   db 39, 63, 19, 47, 19, 47,  3,  3,  3,  3,  3,  3,  0,  0,  0,  0
z_filter_wh:   db  7,  7, 11, 11, 15, 15, 19, 19, 19, 23, 23, 23, 31, 31, 31, 39
               db 39, 39, 47, 47, 47, 63, 63, 63, 79, 79, 79, -1
pw_m1024:      times 2 dw -1024
pw_1to16:      dw  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16
pw_16to1:      dw 16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1
z2_ymul:       dw  1,  2,  1,  2,  1,  2,  1,  2,  3,  4,  3,  4,  3,  4,  3,  4
z2_ymul8:      dw  1,  2,  5,  6,  3,  4,  7,  8,  5,  6, 16, 16,  7,  8
pb_90:         times 4 db 90
z2_y_shuf_h4:  dd  3,  7,  2,  6,  1,  5,  0,  4
z_upsample:    db  0,  1,  4,  5,  8,  9, 12, 13,  2,  3,  6,  7, 10, 11, 14, 15
z2_x_shuf:     db  0,  1,  2,  3,  4,  5,  6,  7,  2,  3,  4,  5,  6,  7,  8,  9
z2_y_shuf:     db  6,  7, 14, 15,  4,  5, 12, 13,  4,  5, 12, 13,  2,  3, 10, 11
z2_y_shuf_us:  db  6,  7, 14, 15,  2,  3, 10, 11,  4,  5, 12, 13,  0,  1,  8,  9
z_filter_k:    dw  4,  4,  5,  5,  4,  4
               dw  8,  8,  6,  6,  4,  4
               dw  0,  0,  0,  0,  2,  2

; Small broadcast constants aliased into existing tables to save rodata space
; (e.g. pw_2 reuses the "2, 2" words at the end of z_filter_k).
%define pw_2  (z_filter_k+32)
%define pw_4  (z_filter_k+ 0)
%define pw_16 (z2_ymul8  +20)

pw_1:    times 2 dw 1
pw_3:    times 2 dw 3
pw_62:   times 2 dw 62
pw_512:  times 2 dw 512
pw_2048: times 2 dw 2048
pd_8:    dd 8
100
; Builds a jump table of 32-bit offsets relative to (table - 2*4).
; %1 = function name, %2 = isa suffix, %3.. = local label names;
; the -2*4 bias allows two extra entries to be prepended via %defines.
%macro JMP_TABLE 3-*
    %xdefine %1_%2_table (%%table - 2*4)
    %xdefine %%base mangle(private_prefix %+ _%1_%2)
    %%table:
    %rep %0 - 2
        dd %%base %+ .%3 - (%%table - 2*4)
        %rotate 1
    %endrep
%endmacro
110
; The dc_splat/cfl_splat tables alias into the middle of the dc/cfl tables
; (the s* entries live past the h*/w* entries; the -10*4 / -8*4 terms in the
; invocations below compensate for that bias).
%define ipred_dc_splat_16bpc_avx2_table (ipred_dc_16bpc_avx2_table + 10*4)
%define ipred_cfl_splat_16bpc_avx2_table (ipred_cfl_16bpc_avx2_table + 8*4)

JMP_TABLE ipred_dc_16bpc,         avx2, h4, h8, h16, h32, h64, w4, w8, w16, w32, w64, \
                                        s4-10*4, s8-10*4, s16-10*4, s32-10*4, s64-10*4
JMP_TABLE ipred_dc_left_16bpc,    avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_h_16bpc,          avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_paeth_16bpc,      avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_16bpc,     avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_h_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_smooth_v_16bpc,   avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z1_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z2_16bpc,         avx2, w4, w8, w16, w32, w64
JMP_TABLE ipred_z3_16bpc,         avx2, h4, h8, h16, h32, h64
JMP_TABLE ipred_filter_16bpc,     avx2, w4, w8, w16, w32
JMP_TABLE ipred_cfl_16bpc,        avx2, h4, h8, h16, h32, w4, w8, w16, w32, \
                                        s4-8*4, s8-8*4, s16-8*4, s32-8*4
JMP_TABLE ipred_cfl_left_16bpc,   avx2, h4, h8, h16, h32
JMP_TABLE ipred_cfl_ac_444_16bpc, avx2, w4, w8, w16, w32
JMP_TABLE pal_pred_16bpc,         avx2, w4, w8, w16, w32, w64

; Tables defined in C code elsewhere in the project.
cextern dr_intra_derivative
cextern filter_intra_taps
134
SECTION .text

INIT_YMM avx2
;-----------------------------------------------------------------------------
; ipred_dc_top_16bpc(dst, stride, topleft, width, height, ...)
; DC prediction from the top edge only.  Reuses ipred_dc_left's .h* summation
; labels (r6 = sum entry, indexed by log2(w) since we sum w top pixels) and the
; dc_splat .s* store labels (wq).  xm4 = w/2 rounding bias, xm5 = log2(w) shift.
;-----------------------------------------------------------------------------
cglobal ipred_dc_top_16bpc, 3, 7, 6, dst, stride, tl, w, h
    movifnidn            hd, hm
    add                 tlq, 2             ; skip the top-left pixel
    movd                xm4, wd
    pxor                xm3, xm3
    pavgw               xm4, xm3           ; xm4 = w/2 (rounding term)
    tzcnt                wd, wd            ; wd = log2(w)
    movd                xm5, wd
    movu                 m0, [tlq]
    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd               r6, [r5+wq*4]     ; summation branch (shared .h* code)
    add                  r6, r5
    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    movsxd               wq, [r5+wq*4]     ; store branch (.s* splat code)
    add                  wq, r5
    jmp                  r6
154
;-----------------------------------------------------------------------------
; ipred_dc_left_16bpc(dst, stride, topleft, width, height, ...)
; DC prediction from the left edge only.  The .h* labels sum 2^n left pixels
; (they fall through: .h64 -> .h32 -> ... -> .h4), divide by h (xm4 = h/2
; bias, xm5 = log2(h) shift), broadcast the result and tail-jump to the
; shared dc_splat store code selected by width.  Also entered from
; ipred_dc_top (which sums top pixels with w-based parameters instead).
;-----------------------------------------------------------------------------
cglobal ipred_dc_left_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    mov                  hd, hm
    sub                 tlq, hq
    movd                xm4, hd
    sub                 tlq, hq            ; tlq -= 2*h: start of left edge
    pxor                xm3, xm3
    pavgw               xm4, xm3           ; xm4 = h/2 (rounding term)
    tzcnt               r6d, hd            ; r6d = log2(h)
    movd                xm5, r6d
    movu                 m0, [tlq]
    lea                  r5, [ipred_dc_left_16bpc_avx2_table]
    movsxd               r6, [r5+r6*4]     ; summation branch by height
    add                  r6, r5
    add                  r5, ipred_dc_splat_16bpc_avx2_table-ipred_dc_left_16bpc_avx2_table
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]     ; store branch by width
    add                  wq, r5
    jmp                  r6
.h64:
    paddw                m0, [tlq+96]
    paddw                m0, [tlq+64]
.h32:
    paddw                m0, [tlq+32]
.h16:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
.h8:
    psrldq              xm1, xm0, 8
    paddw               xm0, xm1
.h4:
    punpcklwd           xm0, xm3           ; widen to dwords (word sum may overflow)
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1           ; horizontal dword sum
    paddd               xm0, xm4           ; + h/2
    psrld               xm0, xm5           ; / h
    lea            stride3q, [strideq*3]
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    jmp                  wq
198
;-----------------------------------------------------------------------------
; ipred_dc_16bpc(dst, stride, topleft, width, height, ...)
; Full DC prediction: average of the w top and h left neighbours.
; Dispatch: r6 = .h* (load/sum left pixels by height), then wq = .w*
; (add top pixels, divide, splat).  xm4 = (w+h)/2 rounding bias,
; xm5 = log2(w+h) shift.  When w != h the divisor w+h is 3*2^n or 5*2^n,
; handled with a fixed-point reciprocal multiply (the 0xAAAB / 0x6667
; constants; presumably ~2^17/3 and ~2^17/5 — standard div-by-const trick).
; The .s* labels double as the shared dc_splat store code used by
; ipred_dc_top/left/128 and ipred_v.
;-----------------------------------------------------------------------------
cglobal ipred_dc_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    tzcnt               r6d, hd
    lea                 r5d, [wq+hq]
    movd                xm4, r5d
    tzcnt               r5d, r5d
    movd                xm5, r5d
    lea                  r5, [ipred_dc_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               r6, [r5+r6*4]     ; height branch (.h*)
    movsxd               wq, [r5+wq*4+5*4] ; width branch (.w*), past 5 h entries
    pxor                 m3, m3
    psrlw               xm4, 1             ; (w+h)/2
    add                  r6, r5
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  r6
.h4:
    movq                xm0, [tlq-8]       ; 4 left pixels
    jmp                  wq
.w4:
    movq                xm1, [tlq+2]       ; 4 top pixels
    paddw                m0, m4
    paddw                m0, m1
    psrlq                m1, m0, 32
    paddw                m0, m1
    psrld                m1, m0, 16
    paddw                m0, m1
    cmp                  hd, 4
    jg .w4_mul
    psrlw               xm0, 3             ; w == h == 4: just >> 3
    jmp .w4_end
.w4_mul:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    lea                 r2d, [hq*2]
    mov                 r6d, 0xAAAB6667    ; packed reciprocals, selected by shift
    shrx                r6d, r6d, r2d
    punpckhwd           xm1, xm0, xm3
    punpcklwd           xm0, xm3
    paddd               xm0, xm1
    movd                xm1, r6d
    psrld               xm0, 2
    pmulhuw             xm0, xm1           ; fixed-point divide by 3 or 5
    psrlw               xm0, 1
.w4_end:
    vpbroadcastw        xm0, xm0
.s4:
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm0
    movq   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s4
    RET
ALIGN function_align
.h8:
    mova                xm0, [tlq-16]      ; 8 left pixels
    jmp                  wq
.w8:
    vextracti128        xm1, m0, 1
    paddw               xm0, [tlq+2]
    paddw               xm0, xm4
    paddw               xm0, xm1
    psrld               xm1, xm0, 16
    paddw               xm0, xm1
    pblendw             xm0, xm3, 0xAA     ; zero odd words before dword reduce
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 8
    je .w8_end
    mov                 r6d, 0xAAAB        ; ~2^17/3
    mov                 r2d, 0x6667        ; ~2^17/5
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w8_end:
    vpbroadcastw        xm0, xm0
.s8:
    mova   [dstq+strideq*0], xm0
    mova   [dstq+strideq*1], xm0
    mova   [dstq+strideq*2], xm0
    mova   [dstq+stride3q ], xm0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s8
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-32]      ; 16 left pixels
    jmp                  wq
.w16:
    paddw                m0, [tlq+2]
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4
    paddw               xm0, xm1
    punpckhwd           xm1, xm0, xm3
    punpcklwd           xm0, xm3
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 16
    je .w16_end
    mov                 r6d, 0xAAAB
    mov                 r2d, 0x6667
    test                 hb, 8|32           ; h == 8 or 32 -> /5, else /3
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w16_end:
    vpbroadcastw         m0, xm0
.s16:
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m0
    mova   [dstq+strideq*2], m0
    mova   [dstq+stride3q ], m0
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s16
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-64]      ; 32 left pixels
    paddw                m0, [tlq-32]
    jmp                  wq
.w32:
    paddw                m0, [tlq+ 2]
    paddw                m0, [tlq+34]
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm3
    punpckhwd           xm0, xm3
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 32
    je .w32_end
    lea                 r2d, [hq*2]
    mov                 r6d, 0x6667AAAB    ; packed reciprocals, selected by shift
    shrx                r6d, r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w32_end:
    vpbroadcastw         m0, xm0
    mova                 m1, m0
.s32:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m0
    mova [dstq+strideq*2+32*1], m1
    mova [dstq+stride3q +32*0], m0
    mova [dstq+stride3q +32*1], m1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .s32
    RET
ALIGN function_align
.h64:
    mova                 m0, [tlq-128]     ; 64 left pixels
    mova                 m1, [tlq- 96]
    paddw                m0, [tlq- 64]
    paddw                m1, [tlq- 32]
    paddw                m0, m1
    jmp                  wq
.w64:
    movu                 m1, [tlq+ 2]
    paddw                m0, [tlq+34]
    paddw                m1, [tlq+66]
    paddw                m0, [tlq+98]
    paddw                m0, m1
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm3
    punpckhwd           xm0, xm3
    paddd               xm1, xm4
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5
    cmp                  hd, 64
    je .w64_end
    mov                 r6d, 0x6667AAAB
    shrx                r6d, r6d, hd
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w64_end:
    vpbroadcastw         m0, xm0
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
.s64:
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*0+32*2], m2
    mova [dstq+strideq*0+32*3], m3
    mova [dstq+strideq*1+32*0], m0
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m2
    mova [dstq+strideq*1+32*3], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .s64
    RET
422
;-----------------------------------------------------------------------------
; ipred_dc_128_16bpc: fills the block with half the dynamic range.
; r8m is the stack argument following h; shifting it right by 11 yields an
; index selecting pw_512 or pw_2048 (adjacent in rodata) — presumably it is
; bitdepth_max (0x3ff -> 512, 0xfff -> 2048); confirm against the C caller.
; Splatting reuses the shared dc_splat .s* store code.
;-----------------------------------------------------------------------------
cglobal ipred_dc_128_16bpc, 2, 7, 6, dst, stride, tl, w, h, stride3
    mov                 r6d, r8m
    shr                 r6d, 11            ; 0 or 1 -> pw_512 / pw_2048
    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movifnidn            hd, hm
    movsxd               wq, [r5+wq*4]
    vpbroadcastd         m0, [r5-ipred_dc_splat_16bpc_avx2_table+pw_512+r6*4]
    mova                 m1, m0
    mova                 m2, m0
    mova                 m3, m0
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
437
;-----------------------------------------------------------------------------
; ipred_v_16bpc: vertical prediction — every row is a copy of the top edge.
; Loads up to 64 top pixels into m0..m3 (extra loads are harmless for small
; widths) and reuses the shared dc_splat .s* store code.
;-----------------------------------------------------------------------------
cglobal ipred_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    movu                 m0, [tlq+ 2]
    movu                 m1, [tlq+34]
    movu                 m2, [tlq+66]
    movu                 m3, [tlq+98]
    lea                  r5, [ipred_dc_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
450
; Horizontal-prediction store loop for one register width.
; %1 = block width (loop label suffix), %2 = store suffix (q = movq, a = mova).
; Broadcasts 4 left-edge pixels to 4 rows per iteration, walking tlq upward.
%macro IPRED_H 2 ; w, store_type
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    vpbroadcastw         m2, [tlq-6]
    vpbroadcastw         m3, [tlq-8]
    sub                 tlq, 8
    mov%2  [dstq+strideq*0], m0
    mov%2  [dstq+strideq*1], m1
    mov%2  [dstq+strideq*2], m2
    mov%2  [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w%1
    RET
ALIGN function_align
%endmacro
467
;-----------------------------------------------------------------------------
; ipred_h_16bpc: horizontal prediction — every row is a splat of one left
; pixel.  w4/w8 run with XMM registers (INIT_XMM), wider sizes with YMM.
;-----------------------------------------------------------------------------
cglobal ipred_h_16bpc, 3, 6, 4, dst, stride, tl, w, h, stride3
    movifnidn            hd, hm
    lea                  r5, [ipred_h_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               wq, [r5+wq*4]
    add                  wq, r5
    lea            stride3q, [strideq*3]
    jmp                  wq
INIT_XMM avx2
.w4:
    IPRED_H               4, q
.w8:
    IPRED_H               8, a
INIT_YMM avx2
.w16:
    IPRED_H              16, a
.w32:
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    vpbroadcastw         m2, [tlq-6]
    vpbroadcastw         m3, [tlq-8]
    sub                 tlq, 8
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*2+32*0], m2
    mova [dstq+strideq*2+32*1], m2
    mova [dstq+stride3q +32*0], m3
    mova [dstq+stride3q +32*1], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w32
    RET
.w64:
    vpbroadcastw         m0, [tlq-2]
    vpbroadcastw         m1, [tlq-4]
    sub                 tlq, 4
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m0
    mova [dstq+strideq*0+32*2], m0
    mova [dstq+strideq*0+32*3], m0
    mova [dstq+strideq*1+32*0], m1
    mova [dstq+strideq*1+32*1], m1
    mova [dstq+strideq*1+32*2], m1
    mova [dstq+strideq*1+32*3], m1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w64
    RET
518
; One vector of Paeth prediction.
; Inputs: m1 = left, m3 = topleft, m%1 = top,
;         m%2 = top - topleft (signed), m%3 = |top - topleft| (ldiff).
; Result in m0: picks top, left, or topleft — whichever neighbour's
; gradient estimate is closest.
%macro PAETH 3 ; top, signed_ldiff, ldiff
    paddw               m0, m%2, m1
    psubw               m7, m3, m0  ; tldiff
    psubw               m0, m%1     ; tdiff
    pabsw               m7, m7
    pabsw               m0, m0
    pminsw              m7, m0
    pcmpeqw             m0, m7
    pcmpgtw             m7, m%3, m7
    vpblendvb           m0, m3, m%1, m0
    vpblendvb           m0, m1, m0, m7
%endmacro
531
;-----------------------------------------------------------------------------
; ipred_paeth_16bpc(dst, stride, topleft, width, height)
; Paeth prediction.  Per width: precompute top (m2/m6/m10/m13), top-topleft
; and |top-topleft| once, then loop over rows broadcasting the left pixel and
; invoking the PAETH macro per 16-pixel vector.  w32 manually spills xmm8/9
; into stack arg slots on Win64; w64 uses WIN64_SPILL_XMM for xmm6-15.
;-----------------------------------------------------------------------------
cglobal ipred_paeth_16bpc, 3, 6, 8, dst, stride, tl, w, h
%define base r5-ipred_paeth_16bpc_avx2_table
    movifnidn           hd, hm
    lea                 r5, [ipred_paeth_16bpc_avx2_table]
    tzcnt               wd, wd
    movsxd              wq, [r5+wq*4]
    vpbroadcastw        m3, [tlq]   ; topleft
    add                 wq, r5
    jmp                 wq
.w4:
    vpbroadcastq        m2, [tlq+2] ; top
    movsldup            m6, [base+ipred_hv_shuf]
    lea                 r3, [strideq*3]
    psubw               m4, m2, m3
    pabsw               m5, m4
.w4_loop:
    sub                tlq, 8
    vpbroadcastq        m1, [tlq]
    pshufb              m1, m6      ; left
    PAETH                2, 4, 5
    vextracti128       xm1, m0, 1
    movq  [dstq+strideq*0], xm0
    movq  [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea               dstq, [dstq+strideq*4]
    sub                 hd, 4
    jg .w4_loop
    RET
ALIGN function_align
.w8:
    vbroadcasti128      m2, [tlq+2]
    movsldup            m6, [base+ipred_hv_shuf]
    psubw               m4, m2, m3
    pabsw               m5, m4
.w8_loop:
    sub                tlq, 4
    vpbroadcastd        m1, [tlq]
    pshufb              m1, m6
    PAETH                2, 4, 5
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea               dstq, [dstq+strideq*2]
    sub                 hd, 2
    jg .w8_loop
    RET
ALIGN function_align
.w16:
    movu                m2, [tlq+2]
    psubw               m4, m2, m3
    pabsw               m5, m4
.w16_loop:
    sub                tlq, 2
    vpbroadcastw        m1, [tlq]
    PAETH                2, 4, 5
    mova            [dstq], m0
    add               dstq, strideq
    dec                 hd
    jg .w16_loop
    RET
ALIGN function_align
.w32:
    movu                m2, [tlq+2]
    movu                m6, [tlq+34]
%if WIN64
    movaps             r4m, xmm8   ; xmm8/9 are callee-saved on Win64
    movaps             r6m, xmm9
%endif
    psubw               m4, m2, m3
    psubw               m8, m6, m3
    pabsw               m5, m4
    pabsw               m9, m8
.w32_loop:
    sub                tlq, 2
    vpbroadcastw        m1, [tlq]
    PAETH                2, 4, 5
    mova       [dstq+32*0], m0
    PAETH                6, 8, 9
    mova       [dstq+32*1], m0
    add               dstq, strideq
    dec                 hd
    jg .w32_loop
%if WIN64
    movaps            xmm8, r4m
    movaps            xmm9, r6m
%endif
    RET
ALIGN function_align
.w64:
    WIN64_SPILL_XMM 16
    movu                m2, [tlq+ 2]
    movu                m6, [tlq+34]
    movu               m10, [tlq+66]
    movu               m13, [tlq+98]
    psubw               m4, m2, m3
    psubw               m8, m6, m3
    psubw              m11, m10, m3
    psubw              m14, m13, m3
    pabsw               m5, m4
    pabsw               m9, m8
    pabsw              m12, m11
    pabsw              m15, m14
.w64_loop:
    sub                tlq, 2
    vpbroadcastw        m1, [tlq]
    PAETH                2, 4, 5
    mova       [dstq+32*0], m0
    PAETH                6, 8, 9
    mova       [dstq+32*1], m0
    PAETH               10, 11, 12
    mova       [dstq+32*2], m0
    PAETH               13, 14, 15
    mova       [dstq+32*3], m0
    add               dstq, strideq
    dec                 hd
    jg .w64_loop
    RET
649
;-----------------------------------------------------------------------------
; ipred_smooth_v_16bpc(dst, stride, topleft, width, height)
; Vertical smooth prediction: each row = bottom + weight[y] * (top - bottom),
; computed with pmulhrsw against the precomputed <<7 1-d weights.  hq runs
; negative up to 0 so weightsq+hq*2 indexes the per-row weight.
;-----------------------------------------------------------------------------
cglobal ipred_smooth_v_16bpc, 3, 7, 6, dst, stride, tl, w, h, weights
%define base r6-ipred_smooth_v_16bpc_avx2_table
    lea                  r6, [ipred_smooth_v_16bpc_avx2_table]
    tzcnt                wd, wm
    mov                  hd, hm
    movsxd               wq, [r6+wq*4]
    lea            weightsq, [base+smooth_weights_1d_16bpc+hq*4]
    neg                  hq
    vpbroadcastw         m5, [tlq+hq*2] ; bottom
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastq         m4, [tlq+2]    ; top
    movsldup             m3, [base+ipred_hv_shuf]
    lea                  r6, [strideq*3]
    psubw                m4, m5         ; top - bottom
.w4_loop:
    vpbroadcastq         m0, [weightsq+hq*2]
    pshufb               m0, m3
    pmulhrsw             m0, m4
    paddw                m0, m5
    vextracti128        xm1, m0, 1
    movhps [dstq+strideq*0], xm1
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movq   [dstq+r6       ], xm0
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w4_loop
.ret:
    RET
.w8:
    vbroadcasti128       m4, [tlq+2]
    movsldup             m3, [base+ipred_hv_shuf]
    lea                  r6, [strideq*3]
    psubw                m4, m5
.w8_loop:
    vpbroadcastd         m0, [weightsq+hq*2+0]
    vpbroadcastd         m1, [weightsq+hq*2+4]
    pshufb               m0, m3
    pshufb               m1, m3
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    vextracti128 [dstq+strideq*0], m0, 1
    mova         [dstq+strideq*1], xm0
    vextracti128 [dstq+strideq*2], m1, 1
    mova         [dstq+r6       ], xm1
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w8_loop
    RET
.w16:
    movu                 m4, [tlq+2]
    lea                  r6, [strideq*3]
    psubw                m4, m5
.w16_loop:
    vpbroadcastw         m0, [weightsq+hq*2+0]
    vpbroadcastw         m1, [weightsq+hq*2+2]
    vpbroadcastw         m2, [weightsq+hq*2+4]
    vpbroadcastw         m3, [weightsq+hq*2+6]
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX   {paddw    x, m5}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+r6       ], m3
    lea                dstq, [dstq+strideq*4]
    add                  hq, 4
    jl .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM       7
    movu                 m4, [tlq+ 2]
    movu                 m6, [tlq+34]
    psubw                m4, m5
    psubw                m6, m5
.w32_loop:
    vpbroadcastw         m1, [weightsq+hq*2+0]
    vpbroadcastw         m3, [weightsq+hq*2+2]
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m6
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m6
    REPX      {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea                dstq, [dstq+strideq*2]
    add                  hq, 2
    jl .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM       8
    movu                 m3, [tlq+ 2]
    movu                 m4, [tlq+34]
    movu                 m6, [tlq+66]
    movu                 m7, [tlq+98]
    REPX      {psubw x, m5}, m3, m4, m6, m7
.w64_loop:
    vpbroadcastw         m2, [weightsq+hq*2]
    pmulhrsw             m0, m3, m2
    pmulhrsw             m1, m4, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*0], m0
    pmulhrsw             m0, m6, m2
    mova        [dstq+32*1], m1
    pmulhrsw             m1, m7, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    add                dstq, strideq
    inc                  hq
    jl .w64_loop
    RET
769
;-----------------------------------------------------------------------------
; ipred_smooth_h_16bpc(dst, stride, topleft, width, height)
; Horizontal smooth prediction: each column = right + weight[x]*(left-right),
; using the per-width slice of the <<7 1-d weight table.  hd is doubled up
; front so tlq+hq indexes left pixels in bytes; the loops count hq down by
; rows*2.
;-----------------------------------------------------------------------------
cglobal ipred_smooth_h_16bpc, 3, 7, 6, dst, stride, tl, w, h, stride3
%define base r6-ipred_smooth_h_16bpc_avx2_table
    lea                  r6, [ipred_smooth_h_16bpc_avx2_table]
    mov                  wd, wm
    movifnidn            hd, hm
    vpbroadcastw         m5, [tlq+wq*2] ; right
    tzcnt                wd, wd
    add                  hd, hd         ; h *= 2 (byte offset into left edge)
    movsxd               wq, [r6+wq*4]
    sub                 tlq, hq
    lea            stride3q, [strideq*3]
    add                  wq, r6
    jmp                  wq
.w4:
    vpbroadcastq         m4, [base+smooth_weights_1d_16bpc+4*2]
    movsldup             m3, [base+ipred_hv_shuf]
.w4_loop:
    vpbroadcastq         m0, [tlq+hq-8] ; left
    pshufb               m0, m3
    psubw                m0, m5         ; left - right
    pmulhrsw             m0, m4
    paddw                m0, m5
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+stride3q ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4*2
    jg .w4_loop
    RET
.w8:
    vbroadcasti128       m4, [base+smooth_weights_1d_16bpc+8*2]
    movsldup             m3, [base+ipred_hv_shuf]
.w8_loop:
    vpbroadcastd         m0, [tlq+hq-4]
    vpbroadcastd         m1, [tlq+hq-8]
    pshufb               m0, m3
    pshufb               m1, m3
    psubw                m0, m5
    psubw                m1, m5
    pmulhrsw             m0, m4
    pmulhrsw             m1, m4
    paddw                m0, m5
    paddw                m1, m5
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    mova         [dstq+strideq*2], xm1
    vextracti128 [dstq+stride3q ], m1, 1
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w8_loop
    RET
.w16:
    movu                 m4, [base+smooth_weights_1d_16bpc+16*2]
.w16_loop:
    vpbroadcastq         m3, [tlq+hq-8] ; 4 left pixels
    punpcklwd            m3, m3
    psubw                m3, m5
    pshufd               m0, m3, q3333
    pshufd               m1, m3, q2222
    pshufd               m2, m3, q1111
    pshufd               m3, m3, q0000
    REPX   {pmulhrsw x, m4}, m0, m1, m2, m3
    REPX   {paddw    x, m5}, m0, m1, m2, m3
    mova   [dstq+strideq*0], m0
    mova   [dstq+strideq*1], m1
    mova   [dstq+strideq*2], m2
    mova   [dstq+stride3q ], m3
    lea                dstq, [dstq+strideq*4]
    sub                  hq, 4*2
    jg .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM       7
    movu                 m4, [base+smooth_weights_1d_16bpc+32*2]
    movu                 m6, [base+smooth_weights_1d_16bpc+32*3]
.w32_loop:
    vpbroadcastw         m1, [tlq+hq-2]
    vpbroadcastw         m3, [tlq+hq-4]
    psubw                m1, m5
    psubw                m3, m5
    pmulhrsw             m0, m4, m1
    pmulhrsw             m1, m6
    pmulhrsw             m2, m4, m3
    pmulhrsw             m3, m6
    REPX      {paddw x, m5}, m0, m1, m2, m3
    mova [dstq+strideq*0+32*0], m0
    mova [dstq+strideq*0+32*1], m1
    mova [dstq+strideq*1+32*0], m2
    mova [dstq+strideq*1+32*1], m3
    lea                dstq, [dstq+strideq*2]
    sub                  hq, 2*2
    jg .w32_loop
    RET
.w64:
    WIN64_SPILL_XMM       8
    movu                 m3, [base+smooth_weights_1d_16bpc+32*4]
    movu                 m4, [base+smooth_weights_1d_16bpc+32*5]
    movu                 m6, [base+smooth_weights_1d_16bpc+32*6]
    movu                 m7, [base+smooth_weights_1d_16bpc+32*7]
.w64_loop:
    vpbroadcastw         m2, [tlq+hq-2]
    psubw                m2, m5
    pmulhrsw             m0, m3, m2
    pmulhrsw             m1, m4, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*0], m0
    pmulhrsw             m0, m6, m2
    mova        [dstq+32*1], m1
    pmulhrsw             m1, m7, m2
    paddw                m0, m5
    paddw                m1, m5
    mova        [dstq+32*2], m0
    mova        [dstq+32*3], m1
    add                dstq, strideq
    sub                  hq, 1*2
    jg .w64_loop
    RET
890
%macro SMOOTH_2D_END 6 ; src[1-2], mul[1-2], add[1-2]
    ; Final blend step of the 2-D smooth predictor:
    ;   m0 = pavgw(packssdw((src1*mul1 + add1) >> 8, (src2*mul2 + add2) >> 8), m5)
    ; src/mul hold interleaved (top,bottom)/(w,256-w) word pairs, so each
    ; pmaddwd produces per-pixel dword sums of a weighted pair.
    pmaddwd             m0, m%1, m%3
    pmaddwd             m1, m%2, m%4
    paddd               m0, m%5        ; fold in the horizontal (left/right) sums
    paddd               m1, m%6
    psrld               m0, 8
    psrld               m1, 8
    packssdw            m0, m1         ; back to words (values fit; saturation is a no-op)
    pavgw               m0, m5         ; m5 is 0 in ipred_smooth_16bpc -> rounding >>1
%endmacro
901
;-----------------------------------------------------------------------------
; void ipred_smooth_16bpc(pixel *dst, ptrdiff_t stride, const pixel *tl,
;                         int w, int h, ...)
; 2-D SMOOTH intra prediction, 16 bpc, AVX2. Each output pixel blends
; top/bottom and left/right edge pixels with smooth_weights_2d_16bpc
; (pairs of w, 256-w), then SMOOTH_2D_END averages the two directions.
; Dispatch is via a per-width jump table; tlq is rebased so that left-edge
; pixels are read with negative/descending h offsets.
;-----------------------------------------------------------------------------
cglobal ipred_smooth_16bpc, 3, 7, 6, dst, stride, tl, w, h, v_weights
%define base r6-ipred_smooth_16bpc_avx2_table
    lea                 r6, [ipred_smooth_16bpc_avx2_table]
    mov                 wd, wm
    vpbroadcastw        m4, [tlq+wq*2] ; right
    tzcnt               wd, wd
    mov                 hd, hm
    sub                tlq, hq         ; tlq -= 2*h: step back over the left
    sub                tlq, hq         ; edge (2 bytes per pixel)
    movsxd              wq, [r6+wq*4]
    pxor                m5, m5         ; m5 = 0 (rounding operand for pavgw)
    add                 wq, r6
    lea         v_weightsq, [base+smooth_weights_2d_16bpc+hq*4] ; vertical weight pairs for this h
    jmp                 wq
.w4:
    WIN64_SPILL_XMM     11
    vpbroadcastw        m0, [tlq] ; bottom
    vpbroadcastq        m6, [tlq+hq*2+2]
    movsldup            m7, [base+ipred_hv_shuf]
    movshdup            m9, [base+ipred_hv_shuf]
    vbroadcasti128     m10, [base+smooth_weights_2d_16bpc+4*4]
    punpcklwd           m6, m0 ; top, bottom
    punpcklqdq          m8, m9, m9
    punpckhqdq          m9, m9
    lea                 r3, [strideq*3]
.w4_loop:
    ; 4 left-edge pixels (rows h-1..h-4) -> broadcast + shuffle per lane
    vpbroadcastq        m3, [tlq+hq*2-8]
    vbroadcasti128      m1, [v_weightsq]
    pshufb              m3, m7
    punpcklwd           m2, m3, m4 ; left, right
    punpckhwd           m3, m4
    pmaddwd             m2, m10    ; horizontal blend: left*w + right*(256-w)
    pmaddwd             m3, m10
    pshufb              m0, m1, m8 ; vertical weights for rows 0/2 and 1/3
    pshufb              m1, m9
    SMOOTH_2D_END        0, 1, 6, 6, 2, 3
    vextracti128       xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movq   [dstq+strideq*1], xm1
    movhps [dstq+strideq*2], xm0
    movhps [dstq+r3       ], xm1
    lea               dstq, [dstq+strideq*4]
    add         v_weightsq, 16     ; 4 rows * 4 bytes of (w, 256-w) pairs
    sub                 hd, 4
    jg .w4_loop
    RET
.w8:
    WIN64_SPILL_XMM     12
    vpbroadcastw        m0, [tlq] ; bottom
    vbroadcasti128      m7, [tlq+hq*2+2]
    movsldup            m8, [base+ipred_hv_shuf]
    movshdup            m9, [base+ipred_hv_shuf]
    vbroadcasti128     m10, [base+smooth_weights_2d_16bpc+8*4+16*0]
    vbroadcasti128     m11, [base+smooth_weights_2d_16bpc+8*4+16*1]
    punpcklwd           m6, m7, m0 ; top, bottom
    punpckhwd           m7, m0
.w8_loop:
    ; 2 rows per iteration; 2 left pixels broadcast then split per lane
    vpbroadcastd        m3, [tlq+hq*2-4]
    vpbroadcastq        m1, [v_weightsq]
    pshufb              m3, m8
    punpcklwd           m2, m3, m4 ; left, right
    punpckhwd           m3, m4
    pmaddwd             m2, m10
    pmaddwd             m3, m11
    pshufb              m1, m9     ; per-row vertical weight pairs
    SMOOTH_2D_END        1, 1, 6, 7, 2, 3
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea               dstq, [dstq+strideq*2]
    add         v_weightsq, 8
    sub                 hd, 2
    jg .w8_loop
    RET
.w16:
    WIN64_SPILL_XMM     11
    vpbroadcastw        m0, [tlq] ; bottom
    movu                m7, [tlq+hq*2+2]
    ; 16 horizontal weight pairs, interleaved across the two 128-bit lanes
    mova               xm8, [base+smooth_weights_2d_16bpc+16*4+16*0]
    mova               xm9, [base+smooth_weights_2d_16bpc+16*4+16*1]
    vinserti128         m8, [base+smooth_weights_2d_16bpc+16*4+16*2], 1
    vinserti128         m9, [base+smooth_weights_2d_16bpc+16*4+16*3], 1
    punpcklwd           m6, m7, m0 ; top, bottom
    punpckhwd           m7, m0
.w16_loop:
    ; 2 rows per iteration; one left pixel per row via pshufd splats
    vpbroadcastd        m3, [tlq+hq*2-4]
    vpbroadcastd        m1, [v_weightsq+0]
    punpcklwd           m3, m4     ; left, right
    pshufd              m2, m3, q1111 ; row 0's (left,right) pair
    pmaddwd            m10, m8, m2
    pmaddwd             m2, m9
    pshufd              m3, m3, q0000 ; row 1's (left,right) pair
    SMOOTH_2D_END        1, 1, 6, 7, 10, 2
    vpbroadcastd        m1, [v_weightsq+4]
    pmaddwd             m2, m8, m3
    pmaddwd             m3, m9
    mova  [dstq+strideq*0], m0
    SMOOTH_2D_END        1, 1, 6, 7, 2, 3
    mova  [dstq+strideq*1], m0
    lea               dstq, [dstq+strideq*2]
    add         v_weightsq, 8
    sub                 hq, 2
    jg .w16_loop
    RET
.w32:
    WIN64_SPILL_XMM     15
    vpbroadcastw        m0, [tlq] ; bottom
    ; top edge split into two 16-pixel halves (m6/m7 and m8/m9)
    movu                m7, [tlq+hq*2+ 2]
    movu                m9, [tlq+hq*2+34]
    mova              xm10, [base+smooth_weights_2d_16bpc+32*4+16*0]
    mova              xm11, [base+smooth_weights_2d_16bpc+32*4+16*1]
    vinserti128        m10, [base+smooth_weights_2d_16bpc+32*4+16*2], 1
    vinserti128        m11, [base+smooth_weights_2d_16bpc+32*4+16*3], 1
    mova              xm12, [base+smooth_weights_2d_16bpc+32*4+16*4]
    mova              xm13, [base+smooth_weights_2d_16bpc+32*4+16*5]
    vinserti128        m12, [base+smooth_weights_2d_16bpc+32*4+16*6], 1
    vinserti128        m13, [base+smooth_weights_2d_16bpc+32*4+16*7], 1
    punpcklwd           m6, m7, m0
    punpckhwd           m7, m0
    punpcklwd           m8, m9, m0
    punpckhwd           m9, m0
.w32_loop:
    ; 1 row per iteration: one left pixel, one vertical weight pair
    vpbroadcastw        m3, [tlq+hq*2-2]
    vpbroadcastd       m14, [v_weightsq]
    punpcklwd           m3, m4
    pmaddwd             m1, m10, m3
    pmaddwd             m2, m11, m3
    ; first 16 pixels: SMOOTH_2D_END expanded inline to overlap with the
    ; second half's horizontal pmaddwds
    pmaddwd             m0, m6, m14
    paddd               m0, m1
    pmaddwd             m1, m7, m14
    paddd               m1, m2
    pmaddwd             m2, m12, m3
    pmaddwd             m3, m13
    psrld               m0, 8
    psrld               m1, 8
    packssdw            m0, m1
    pavgw               m0, m5
    mova       [dstq+32*0], m0
    SMOOTH_2D_END       14, 14, 8, 9, 2, 3 ; second 16 pixels
    mova       [dstq+32*1], m0
    add               dstq, strideq
    add         v_weightsq, 4
    dec                 hd
    jg .w32_loop
    RET
.w64:
    ; Not enough registers for 64 columns at once: process two 32-column
    ; strips (x = 0 and x = 32), re-running all rows for each strip.
    PROLOGUE 0, 11, 16, dst, stride, tl, tl_base, h, v_weights, dummy, v_weights_base, x, y, dst_base
    mov          dst_baseq, dstq
    mov           tl_baseq, tlq
    mov    v_weights_baseq, v_weightsq
    xor                 xq, xq
.w64_loop_x:
    mov                 yq, hq
    lea                tlq, [tl_baseq+hq*2]
    vpbroadcastw        m0, [tl_baseq] ; bottom
    movu                m7, [tlq+xq*2+ 2]
    movu                m9, [tlq+xq*2+34]
    ; horizontal weights for this strip; r6 (base) is advanced by 16*8
    ; after the first strip so the same offsets fetch columns 32..63
    mova              xm10, [base+smooth_weights_2d_16bpc+64*4+16*0]
    mova              xm11, [base+smooth_weights_2d_16bpc+64*4+16*1]
    vinserti128        m10, [base+smooth_weights_2d_16bpc+64*4+16*2], 1
    vinserti128        m11, [base+smooth_weights_2d_16bpc+64*4+16*3], 1
    mova              xm12, [base+smooth_weights_2d_16bpc+64*4+16*4]
    mova              xm13, [base+smooth_weights_2d_16bpc+64*4+16*5]
    vinserti128        m12, [base+smooth_weights_2d_16bpc+64*4+16*6], 1
    vinserti128        m13, [base+smooth_weights_2d_16bpc+64*4+16*7], 1
    punpcklwd           m6, m7, m0
    punpckhwd           m7, m0
    punpcklwd           m8, m9, m0
    punpckhwd           m9, m0
    lea                tlq, [tl_baseq-2]
.w64_loop_y:
    ; 1 row x 32 pixels per iteration (same structure as .w32_loop)
    vpbroadcastw        m3, [tlq+yq*2]
    vpbroadcastd        m1, [v_weightsq]
    punpcklwd           m3, m4
    pmaddwd            m14, m10, m3
    pmaddwd            m15, m11, m3
    pmaddwd             m2, m12, m3
    pmaddwd             m3, m13
    pmaddwd             m0, m6, m1
    paddd               m0, m14
    pmaddwd            m14, m7, m1
    paddd              m14, m15
    psrld               m0, 8
    psrld              m14, 8
    packssdw            m0, m14
    pavgw               m0, m5
    mova       [dstq+32*0], m0
    SMOOTH_2D_END        8, 9, 1, 1, 2, 3
    mova       [dstq+32*1], m0
    add               dstq, strideq
    add         v_weightsq, 4
    dec                 yq
    jg .w64_loop_y
    lea               dstq, [dst_baseq+32*2] ; next 32-column strip
    add                 r6, 16*8             ; shift weight base to columns 32..63
    mov         v_weightsq, v_weights_baseq
    add                 xq, 32
    test                xb, 64               ; done after the x=32 strip
    jz .w64_loop_x
    RET
1101
1102cglobal ipred_z1_16bpc, 3, 8, 0, dst, stride, tl, w, h, angle, dx, maxbase
1103    lea                  r6, [ipred_z1_16bpc_avx2_table]
1104    tzcnt                wd, wm
1105    movifnidn        angled, anglem
1106    movifnidn            hd, hm
1107    lea                  r7, [dr_intra_derivative]
1108    movsxd               wq, [r6+wq*4]
1109    add                 tlq, 2
1110    add                  wq, r6
1111    mov                 dxd, angled
1112    and                 dxd, 0x7e
1113    add              angled, 165 ; ~90
1114    movzx               dxd, word [r7+dxq]
1115    xor              angled, 0x4ff ; d = 90 - angle
1116    vpbroadcastd         m5, [pw_62]
1117    jmp                  wq
1118.w4:
1119    ALLOC_STACK         -64, 7
1120    cmp              angleb, 40
1121    jae .w4_no_upsample
1122    lea                 r3d, [angleq-1024]
1123    sar                 r3d, 7
1124    add                 r3d, hd
1125    jg .w4_no_upsample ; !enable_intra_edge_filter || h > 8 || (h == 8 && is_sm)
1126    vpbroadcastw        xm3, [tlq+14]
1127    movu                xm1, [tlq+ 0]    ; 1 2 3 4 5 6 7 8
1128    palignr             xm0, xm3, xm1, 4 ; 3 4 5 6 7 8 8 8
1129    paddw               xm0, [tlq- 2]    ; 0 1 2 3 4 5 6 7
1130    add                 dxd, dxd
1131    palignr             xm2, xm3, xm1, 2 ; 2 3 4 5 6 7 8 8
1132    paddw               xm2, xm1         ; -1 * a + 9 * b + 9 * c + -1 * d
1133    psubw               xm0, xm2, xm0    ; = (b + c - a - d + (b + c) << 3 + 8) >> 4
1134    psraw               xm0, 3           ; = ((b + c - a - d) >> 3 + b + c + 1) >> 1
1135    pxor                xm4, xm4
1136    paddw               xm2, xm0
1137    vpbroadcastw        xm0, r8m         ; pixel_max
1138    mova           [rsp+32], xm3
1139    movd                xm3, dxd
1140    pmaxsw              xm2, xm4
1141    mov                 r3d, dxd
1142    pavgw               xm2, xm4
1143    vpbroadcastw         m3, xm3
1144    pminsw              xm2, xm0
1145    punpcklwd           xm0, xm1, xm2
1146    punpckhwd           xm1, xm2
1147    lea                  r5, [strideq*3]
1148    pslldq               m2, m3, 8
1149    mova           [rsp+ 0], xm0
1150    mova           [rsp+16], xm1
1151    paddw                m6, m3, m3
1152    paddw                m3, m2
1153    vpblendd             m4, m6, 0xf0
1154    paddw                m6, m6
1155    paddw                m3, m4 ; xpos0 xpos1 xpos2 xpos3
1156    vbroadcasti128       m4, [z_upsample]
1157.w4_upsample_loop:
1158    lea                 r2d, [r3+dxq]
1159    shr                 r3d, 6 ; base0
1160    movu                xm1, [rsp+r3*2]
1161    lea                 r3d, [r2+dxq]
1162    shr                 r2d, 6 ; base1
1163    movu                xm2, [rsp+r2*2]
1164    lea                 r2d, [r3+dxq]
1165    shr                 r3d, 6 ; base2
1166    vinserti128          m1, [rsp+r3*2], 1 ; 0 2
1167    lea                 r3d, [r2+dxq]
1168    shr                 r2d, 6 ; base3
1169    vinserti128          m2, [rsp+r2*2], 1 ; 1 3
1170    pshufb               m1, m4
1171    pshufb               m2, m4
1172    punpcklqdq           m0, m1, m2
1173    punpckhqdq           m1, m2
1174    pand                 m2, m5, m3 ; frac
1175    psllw                m2, 9      ; (a * (64 - frac) + b * frac + 32) >> 6
1176    psubw                m1, m0     ; = a + (((b - a) * frac + 32) >> 6)
1177    pmulhrsw             m1, m2     ; = a + (((b - a) * (frac << 9) + 16384) >> 15)
1178    paddw                m3, m6     ; xpos += dx
1179    paddw                m0, m1
1180    vextracti128        xm1, m0, 1
1181    movq   [dstq+strideq*0], xm0
1182    movhps [dstq+strideq*1], xm0
1183    movq   [dstq+strideq*2], xm1
1184    movhps [dstq+r5       ], xm1
1185    lea                dstq, [dstq+strideq*4]
1186    sub                  hd, 4
1187    jg .w4_upsample_loop
1188    RET
1189ALIGN function_align
1190.filter_strength: ; w4/w8/w16
1191%define base r3-z_filter_t0
1192    movd                xm0, maxbased
1193    lea                  r3, [z_filter_t0]
1194    movd                xm1, angled
1195    shr              angled, 8 ; is_sm << 1
1196    vpbroadcastb         m0, xm0
1197    vpbroadcastb         m1, xm1
1198    pcmpeqb              m0, [base+z_filter_wh]
1199    mova                xm2, [r3+angleq*8]
1200    pand                 m0, m1
1201    pcmpgtb              m0, m2
1202    pmovmskb            r5d, m0
1203    ret
1204.w4_no_upsample:
1205    mov            maxbased, 7
1206    test             angled, 0x400 ; !enable_intra_edge_filter
1207    jnz .w4_main
1208    lea            maxbased, [hq+3]
1209    call .filter_strength
1210    mov            maxbased, 7
1211    test                r5d, r5d
1212    jz .w4_main ; filter_strength == 0
1213    popcnt              r5d, r5d
1214    vpbroadcastw        xm3, [tlq+14]
1215    mova                xm0, [tlq- 2]      ; 0 1 2 3 4 5 6 7
1216    vpbroadcastd        xm1, [base+z_filter_k-4+r5*4+12*1]
1217    vpbroadcastd        xm4, [base+z_filter_k-4+r5*4+12*0]
1218    palignr             xm2, xm3, xm0, 4   ; 2 3 4 5 6 7 8 8
1219    pmullw              xm1, [tlq+ 0]      ; 1 2 3 4 5 6 7 8
1220    paddw               xm2, xm0
1221    pmullw              xm2, xm4
1222    movd           [rsp+16], xm3
1223    cmp                 r5d, 3
1224    jne .w4_3tap
1225    paddw               xm1, xm2
1226    palignr             xm2, xm3, xm0, 6   ; 3 4 5 6 7 8 8 8
1227    pblendw             xm0, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6
1228    movzx               r3d, word [tlq+14]
1229    movzx               r2d, word [tlq+12]
1230    inc            maxbased
1231    paddw               xm2, xm0
1232    sub                 r2d, r3d
1233    paddw               xm2, xm2
1234    lea                 r2d, [r2+r3*8+4]
1235    shr                 r2d, 3 ; (1 * top[6] + 7 * top[7] + 4) >> 3
1236    mov            [rsp+16], r2w
1237.w4_3tap:
1238    pxor                xm0, xm0
1239    paddw               xm1, xm2
1240    mov                 tlq, rsp
1241    psrlw               xm1, 3
1242    cmp                  hd, 8
1243    sbb            maxbased, -1
1244    pavgw               xm0, xm1
1245    mova              [tlq], xm0
1246.w4_main:
1247    movd                xm3, dxd
1248    vpbroadcastq         m1, [z_base_inc]
1249    vpbroadcastw         m6, [tlq+maxbaseq*2] ; top[max_base_x]
1250    shl            maxbased, 6
1251    vpbroadcastw         m3, xm3
1252    movd                xm0, maxbased
1253    mov                 r3d, dxd      ; xpos
1254    vpbroadcastw         m0, xm0
1255    paddw                m4, m3, m3
1256    psubw                m1, m0       ; -max_base_x
1257    vpblendd             m3, m4, 0xcc
1258    paddw                m0, m4, m3
1259    vpblendd             m3, m0, 0xf0 ; xpos0 xpos1 xpos2 xpos3
1260    paddw                m4, m4
1261    paddw                m3, m1
1262.w4_loop:
1263    lea                 r5d, [r3+dxq]
1264    shr                 r3d, 6 ; base0
1265    movu                xm1, [tlq+r3*2]
1266    lea                 r3d, [r5+dxq]
1267    shr                 r5d, 6 ; base1
1268    movu                xm2, [tlq+r5*2]
1269    lea                 r5d, [r3+dxq]
1270    shr                 r3d, 6 ; base2
1271    vinserti128          m1, [tlq+r3*2], 1 ; 0 2
1272    lea                 r3d, [r5+dxq]
1273    shr                 r5d, 6 ; base3
1274    vinserti128          m2, [tlq+r5*2], 1 ; 1 3
1275    punpcklqdq           m0, m1, m2
1276    psrldq               m1, 2
1277    pslldq               m2, 6
1278    vpblendd             m1, m2, 0xcc
1279    pand                 m2, m5, m3
1280    psllw                m2, 9
1281    psubw                m1, m0
1282    pmulhrsw             m1, m2
1283    psraw                m2, m3, 15 ; xpos < max_base_x
1284    paddw                m3, m4
1285    paddw                m0, m1
1286    vpblendvb            m0, m6, m0, m2
1287    vextracti128        xm1, m0, 1
1288    movq   [dstq+strideq*0], xm0
1289    movhps [dstq+strideq*1], xm0
1290    lea                dstq, [dstq+strideq*2]
1291    movq   [dstq+strideq*0], xm1
1292    movhps [dstq+strideq*1], xm1
1293    sub                  hd, 4
1294    jz .w4_end
1295    lea                dstq, [dstq+strideq*2]
1296    cmp                 r3d, maxbased
1297    jb .w4_loop
1298    lea                  r6, [strideq*3]
1299.w4_end_loop:
1300    movq   [dstq+strideq*0], xm6
1301    movq   [dstq+strideq*1], xm6
1302    movq   [dstq+strideq*2], xm6
1303    movq   [dstq+r6       ], xm6
1304    lea                dstq, [dstq+strideq*4]
1305    sub                  hd, 4
1306    jg .w4_end_loop
1307.w4_end:
1308    RET
1309.w8:
1310    ALLOC_STACK         -64, 7
1311    lea                 r3d, [angleq+216]
1312    mov                 r3b, hb
1313    cmp                 r3d, 8
1314    ja .w8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || h > 8
1315    movu                 m2, [tlq+2]    ; 2 3 4 5 6 7 8 9   a b c d e f g _
1316    movu                 m0, [tlq+4]    ; 3 4 5 6 7 8 9 a   b c d e f g _ _
1317    movu                 m1, [tlq+0]    ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1318    cmp                  hd, 4
1319    jne .w8_upsample_h8 ; awkward single-pixel edge case
1320    vpblendd             m0, m2, 0x20   ; 3 4 5 6 7 8 9 a   b c c _ _ _ _ _
1321.w8_upsample_h8:
1322    paddw                m2, m1
1323    paddw                m0, [tlq-2]    ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1324    add                 dxd, dxd
1325    psubw                m0, m2, m0
1326    psraw                m0, 3
1327    pxor                 m4, m4
1328    paddw                m2, m0
1329    vpbroadcastw         m0, r8m
1330    movd                xm3, dxd
1331    pmaxsw               m2, m4
1332    mov                 r3d, dxd
1333    pavgw                m2, m4
1334    vpbroadcastw         m3, xm3
1335    pminsw               m2, m0
1336    punpcklwd            m0, m1, m2
1337    punpckhwd            m1, m2
1338    vbroadcasti128       m4, [z_upsample]
1339    mova           [rsp+ 0], xm0
1340    mova           [rsp+16], xm1
1341    paddw                m6, m3, m3
1342    vextracti128   [rsp+32], m0, 1
1343    vextracti128   [rsp+48], m1, 1
1344    vpblendd             m3, m6, 0xf0 ; xpos0 xpos1
1345.w8_upsample_loop:
1346    lea                 r2d, [r3+dxq]
1347    shr                 r3d, 6 ; base0
1348    movu                xm1, [rsp+r3*2]
1349    movu                xm2, [rsp+r3*2+16]
1350    lea                 r3d, [r2+dxq]
1351    shr                 r2d, 6 ; base1
1352    vinserti128          m1, [rsp+r2*2], 1
1353    vinserti128          m2, [rsp+r2*2+16], 1
1354    pshufb               m1, m4
1355    pshufb               m2, m4
1356    punpcklqdq           m0, m1, m2
1357    punpckhqdq           m1, m2
1358    pand                 m2, m5, m3
1359    psllw                m2, 9
1360    psubw                m1, m0
1361    pmulhrsw             m1, m2
1362    paddw                m3, m6
1363    paddw                m0, m1
1364    mova         [dstq+strideq*0], xm0
1365    vextracti128 [dstq+strideq*1], m0, 1
1366    lea                dstq, [dstq+strideq*2]
1367    sub                  hd, 2
1368    jg .w8_upsample_loop
1369    RET
1370.w8_no_intra_edge_filter:
1371    and            maxbased, 7
1372    or             maxbased, 8 ; imin(h+7, 15)
1373    jmp .w8_main
1374.w8_no_upsample:
1375    lea            maxbased, [hq+7]
1376    test             angled, 0x400
1377    jnz .w8_no_intra_edge_filter
1378    call .filter_strength
1379    test                r5d, r5d
1380    jz .w8_main
1381    popcnt              r5d, r5d
1382    vpbroadcastd         m1, [base+z_filter_k-4+r5*4+12*1]
1383    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0]
1384    mova                 m0, [tlq-2]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1385    movu                 m2, [tlq+0]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1386    pmullw               m1, m2
1387    cmp                  hd, 8
1388    jl .w8_filter_h4
1389    punpckhwd            m2, m2
1390    vpblendd             m3, m2, [tlq+2], 0x7f ; 2 3 4 5 6 7 8 9   a b c d e f g g
1391    je .w8_filter_end ; 8x4 and 8x8 are always 3-tap
1392    movzx               r3d, word [tlq+30]
1393    mov            maxbased, 16
1394    mov            [rsp+32], r3d
1395    cmp                 r5d, 3
1396    jne .w8_filter_end
1397    punpcklwd           xm6, xm0, xm0
1398    vpblendd             m2, [tlq+4], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g g g
1399    vpblendd             m6, [tlq-4], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1400    movzx               r5d, word [tlq+28]
1401    mov            [rsp+34], r3w
1402    paddw                m2, m6
1403    sub                 r5d, r3d
1404    inc            maxbased
1405    paddw                m2, m2
1406    lea                 r3d, [r5+r3*8+4]
1407    paddw                m1, m2
1408    shr                 r3d, 3
1409    mov            [rsp+32], r3w
1410    jmp .w8_filter_end
1411.w8_filter_h4:
1412    pshuflw              m3, m2, q3321
1413    vinserti128          m3, [tlq+2], 0        ; 2 3 4 5 6 7 8 9   a b c c _ _ _ _
1414.w8_filter_end:
1415    paddw                m0, m3
1416    pmullw               m0, m4
1417    mov                 tlq, rsp
1418    pxor                 m2, m2
1419    paddw                m0, m1
1420    psrlw                m0, 3
1421    pavgw                m0, m2
1422    mova              [tlq], m0
1423.w8_main:
1424    movd                xm3, dxd
1425    vbroadcasti128       m1, [z_base_inc]
1426    vpbroadcastw         m6, [tlq+maxbaseq*2]
1427    shl            maxbased, 6
1428    vpbroadcastw         m3, xm3
1429    movd                xm0, maxbased
1430    mov                 r3d, dxd
1431    vpbroadcastw         m0, xm0
1432    paddw                m4, m3, m3
1433    psubw                m1, m0
1434    vpblendd             m3, m4, 0xf0 ; xpos0 xpos1
1435    paddw                m3, m1
1436.w8_loop:
1437    lea                 r5d, [r3+dxq]
1438    shr                 r3d, 6
1439    movu                xm0, [tlq+r3*2]
1440    movu                xm1, [tlq+r3*2+2]
1441    lea                 r3d, [r5+dxq]
1442    shr                 r5d, 6
1443    vinserti128          m0, [tlq+r5*2], 1
1444    vinserti128          m1, [tlq+r5*2+2], 1
1445    pand                 m2, m5, m3
1446    psllw                m2, 9
1447    psubw                m1, m0
1448    pmulhrsw             m1, m2
1449    psraw                m2, m3, 15
1450    paddw                m3, m4
1451    paddw                m0, m1
1452    vpblendvb            m0, m6, m0, m2
1453    mova         [dstq+strideq*0], xm0
1454    vextracti128 [dstq+strideq*1], m0, 1
1455    sub                  hd, 2
1456    jz .w8_end
1457    lea                dstq, [dstq+strideq*2]
1458    cmp                 r3d, maxbased
1459    jb .w8_loop
1460.w8_end_loop:
1461    mova   [dstq+strideq*0], xm6
1462    mova   [dstq+strideq*1], xm6
1463    lea                dstq, [dstq+strideq*2]
1464    sub                  hd, 2
1465    jg .w8_end_loop
1466.w8_end:
1467    RET
1468.w16_no_intra_edge_filter:
1469    and            maxbased, 15
1470    or             maxbased, 16 ; imin(h+15, 31)
1471    jmp .w16_main
1472.w16:
1473    ALLOC_STACK         -96, 7
1474    lea            maxbased, [hq+15]
1475    test             angled, 0x400
1476    jnz .w16_no_intra_edge_filter
1477    call .filter_strength
1478    test                r5d, r5d
1479    jz .w16_main
1480    popcnt              r5d, r5d
1481    mova                 m0, [tlq-2]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1482    paddw                m1, m0, [tlq+2]        ; 2 3 4 5 6 7 8 9   a b c d e f g h
1483    cmp                 r5d, 3
1484    jne .w16_filter_3tap
1485    vpbroadcastd         m2, [base+pw_3]
1486    punpcklwd           xm0, xm0
1487    vpblendd             m0, [tlq-4], 0xfe      ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
1488    paddw                m1, [tlq+0]            ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1489    paddw                m0, m2
1490    pavgw                m0, [tlq+4]            ; 3 4 5 6 7 8 9 a   b c d e f g h i
1491    paddw                m0, m1
1492    psrlw                m0, 2
1493    movu                 m3, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
1494    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1495    paddw                m1, m3, [tlq+30]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1496    cmp                  hd, 8
1497    jl .w16_filter_5tap_h4
1498    punpckhwd            m3, m3
1499    je .w16_filter_5tap_h8
1500    vpblendd             m4, m3, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
1501    vpblendd             m3, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
1502    movzx               r3d, word [tlq+62]
1503    movzx               r2d, word [tlq+60]
1504    pavgw                m2, m4
1505    sub                 r2d, r3d
1506    paddw                m1, m3
1507    lea                 r2d, [r2+r3*8+4]
1508    paddw                m1, m2
1509    shr                 r2d, 3
1510    psrlw                m1, 2
1511    mov            [rsp+66], r3w
1512    mov            [rsp+64], r2w
1513    mov                 tlq, rsp
1514    mov                 r3d, 33
1515    cmp                  hd, 16
1516    cmovg          maxbased, r3d
1517    jmp .w16_filter_end2
1518.w16_filter_5tap_h8:
1519    vpblendd            xm4, xm3, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
1520    vpblendd            xm3, [tlq+34], 0x07      ; 3 4 5 6 7 8 9 9
1521    pavgw               xm2, xm4
1522    paddw               xm1, xm3
1523    paddw               xm1, xm2
1524    psrlw               xm1, 2
1525    jmp .w16_filter_end2
1526.w16_filter_5tap_h4:
1527    pshuflw             xm4, xm3, q3332          ; 4 5 5 5
1528    pshuflw             xm3, xm3, q3321          ; 3 4 5 5
1529    pavgw               xm2, xm4
1530    paddw               xm1, xm3
1531    paddw               xm1, xm2
1532    psrlw               xm1, 2
1533    jmp .w16_filter_end2
1534.w16_filter_3tap:
1535    vpbroadcastd         m3, [base+z_filter_k-4+r5*4+12*1]
1536    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0]
1537    pmullw               m0, m3, [tlq+0]    ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1538    movu                 m2, [tlq+32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
1539    pmullw               m1, m4
1540    pmullw               m3, m2
1541    paddw                m0, m1
1542    cmp                  hd, 8
1543    je .w16_filter_3tap_h8
1544    jl .w16_filter_3tap_h4
1545    punpckhwd            m2, m2
1546    vpblendd             m2, [tlq+34], 0x7f ; 2 3 4 5 6 7 8 9   a b c d e f g g
1547    jmp .w16_filter_end
1548.w16_filter_3tap_h4:
1549    pshuflw             xm2, xm2, q3321     ; 2 3 4 4 _ _ _ _
1550    jmp .w16_filter_end
1551.w16_filter_3tap_h8:
1552    psrldq              xm2, 2
1553    pshufhw             xm2, xm2, q2210     ; 2 3 4 5 6 7 8 8
1554.w16_filter_end:
1555    paddw                m2, [tlq+30]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
1556    pmullw               m2, m4
1557    psrlw                m0, 3
1558    pxor                 m1, m1
1559    paddw                m2, m3
1560    psrlw                m2, 3
1561    pavgw                m0, m1
1562    pavgw                m1, m2
1563.w16_filter_end2:
1564    mov                 tlq, rsp
1565    mova           [tlq+ 0], m0
1566    mova           [tlq+32], m1
1567.w16_main:
1568    movd                xm4, dxd
1569    vpbroadcastw         m6, [tlq+maxbaseq*2]
1570    shl            maxbased, 6
1571    vpbroadcastw         m4, xm4
1572    movd                xm0, maxbased
1573    mov                 r3d, dxd
1574    vpbroadcastw         m0, xm0
1575    paddw                m3, m4, [z_base_inc]
1576    psubw                m3, m0
1577.w16_loop:
1578    lea                 r5d, [r3+dxq]
1579    shr                 r3d, 6
1580    movu                 m0, [tlq+r3*2]
1581    movu                 m1, [tlq+r3*2+2]
1582    lea                 r3d, [r5+dxq]
1583    shr                 r5d, 6
1584    pand                 m2, m5, m3
1585    psllw                m2, 9
1586    psubw                m1, m0
1587    pmulhrsw             m1, m2
1588    psraw                m2, m3, 15
1589    paddw                m3, m4
1590    paddw                m1, m0
1591    movu                 m0, [tlq+r5*2]
1592    vpblendvb            m2, m6, m1, m2
1593    movu                 m1, [tlq+r5*2+2]
1594    mova   [dstq+strideq*0], m2
1595    pand                 m2, m5, m3
1596    psllw                m2, 9
1597    psubw                m1, m0
1598    pmulhrsw             m1, m2
1599    psraw                m2, m3, 15
1600    paddw                m3, m4
1601    paddw                m0, m1
1602    vpblendvb            m0, m6, m0, m2
1603    mova   [dstq+strideq*1], m0
1604    sub                  hd, 2
1605    jz .w16_end
1606    lea                dstq, [dstq+strideq*2]
1607    cmp                 r3d, maxbased
1608    jb .w16_loop
1609.w16_end_loop:
1610    mova   [dstq+strideq*0], m6
1611    mova   [dstq+strideq*1], m6
1612    lea                dstq, [dstq+strideq*2]
1613    sub                  hd, 2
1614    jg .w16_end_loop
1615.w16_end:
1616    RET
.w32:
    ; z1, width 32: optionally smooth the top edge into a stack buffer,
    ; then interpolate two 16-pixel halves per row.
    ALLOC_STACK        -160, 8
    lea            maxbased, [hq+31]
    mov                 r3d, 63
    cmp                  hd, 32
    cmova          maxbased, r3d ; maxbase = min(h, 32) + 31
    test             angled, 0x400
    jnz .w32_main ; !enable_intra_edge_filter
    ; Strength-3 edge smoothing, (x[-2]+2*x[-1]+2*x[0]+2*x[1]+x[2]+4)>>3,
    ; computed as (x[-1]+x[0]+x[1] + avg(x[-2]+3, x[2])) >> 2.
    vpbroadcastd         m2, [pw_3]
    mova                 m0, [tlq-2]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    punpcklwd           xm1, xm0, xm0
    vpblendd             m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    paddw                m0, [tlq+0]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    paddw                m1, m2
    paddw                m0, [tlq+2]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
    pavgw                m1, [tlq+4]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    mov                  r3, rsp
    paddw                m0, m1
    lea                 r5d, [maxbaseq-31] ; edge pixels left to filter
    psrlw                m0, 2
    mova               [r3], m0
.w32_filter_loop:
    mova                 m0, [tlq+30]
    paddw                m1, m2, [tlq+28]
    add                 tlq, 32
    paddw                m0, [tlq+0]
    pavgw                m1, [tlq+4]
    paddw                m0, [tlq+2]
    add                  r3, 32
    paddw                m0, m1
    psrlw                m0, 2
    mova               [r3], m0
    sub                 r5d, 16
    jg .w32_filter_loop
    ; Final (partial) group; vector ops don't touch EFLAGS, so the jl below
    ; still tests the `sub r5d, 16` result: <16 remaining -> 8-pixel tail.
    movu                 m0, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
    punpckhwd            m1, m0, m0
    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    paddw                m0, [tlq+30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    jl .w32_filter_h8
    vpblendd             m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
    vpblendd             m1, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
    movzx               r5d, word [tlq+62]
    movzx               r2d, word [tlq+60]
    pavgw                m2, m3
    sub                 r2d, r5d
    paddw                m0, m1
    lea                 r2d, [r2+r5*8+4]   ; extrapolate pixel 64: (7*px63 + px62 + 4) >> 3
    paddw                m0, m2
    shr                 r2d, 3
    psrlw                m0, 2
    mova            [r3+32], m0
    mov             [r3+66], r5w
    mov             [r3+64], r2w
    mov                 tlq, rsp           ; read the filtered edge from the stack
    mov                 r3d, 65
    cmp                  hd, 64
    cmove          maxbased, r3d          ; h == 64 -> extrapolated pixel is valid
    jmp .w32_main
.w32_filter_h8:
    vpblendd            xm3, xm1, [tlq+36], 0x07 ; 4 5 6 7 8 9 9 9
    vpblendd            xm1, [tlq+34], 0x07      ; 3 4 5 6 7 8 9 9
    pavgw               xm2, xm3
    paddw               xm0, xm1
    mov                 tlq, rsp
    paddw               xm0, xm2
    psrlw               xm0, 2
    mova            [r3+32], xm0
.w32_main:
    ; r5 = xpos accumulator (6-bit fraction); m3 = per-lane xpos - maxbase,
    ; m4 = dx broadcast, m6 = edge pixel, m7 = threshold for the upper half.
    movd                xm4, dxd
    vpbroadcastw         m6, [tlq+maxbaseq*2]
    shl            maxbased, 6
    vpbroadcastw         m4, xm4
    movd                xm0, maxbased
    mov                 r5d, dxd
    vpbroadcastd         m7, [pw_m1024] ; -16 * 64
    vpbroadcastw         m0, xm0
    paddw                m3, m4, [z_base_inc]
    psubw                m3, m0
.w32_loop:
    mov                 r3d, r5d
    shr                 r3d, 6                   ; base_x
    movu                 m0, [tlq+r3*2]
    movu                 m1, [tlq+r3*2+2]
    pand                 m2, m5, m3              ; m5 = fraction mask (set up earlier; presumably pw_62)
    psllw                m2, 9                   ; scale fraction for pmulhrsw rounding
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    psraw                m1, m3, 15              ; xpos >= maxbase mask, lower half
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*0], m0
    movu                 m0, [tlq+r3*2+32]
    movu                 m1, [tlq+r3*2+34]
    add                 r5d, dxd
    psubw                m1, m0
    pmulhrsw             m1, m2
    pcmpgtw              m2, m7, m3              ; same test offset by 16 pixels
    paddw                m3, m4
    paddw                m0, m1
    vpblendvb            m0, m6, m0, m2
    mova        [dstq+32*1], m0
    dec                  hd
    jz .w32_end
    add                dstq, strideq
    cmp                 r5d, maxbased
    jb .w32_loop
.w32_end_loop:
    ; Remaining rows are entirely past the edge: fill with the edge pixel.
    mova        [dstq+32*0], m6
    mova        [dstq+32*1], m6
    add                dstq, strideq
    dec                  hd
    jg .w32_end_loop
.w32_end:
    RET
.w64:
    ; z1, width 64: same edge filter as w32, then four 16-pixel stores per row
    ; with staggered out-of-range thresholds (m7/m8/m9 = -16/-32/-48 * 64).
    ALLOC_STACK        -256, 10
    lea            maxbased, [hq+63]
    test             angled, 0x400
    jnz .w64_main ; !enable_intra_edge_filter
    ; Strength-3 edge smoothing (see .w32): (x[-2]+2*x[-1]+2*x[0]+2*x[1]+x[2]+4)>>3.
    vpbroadcastd         m2, [pw_3]
    mova                 m0, [tlq-2]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    punpcklwd           xm1, xm0, xm0
    vpblendd             m1, [tlq-4], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    paddw                m0, [tlq+0]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    paddw                m1, m2
    paddw                m0, [tlq+2]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
    pavgw                m1, [tlq+4]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
    mov                  r3, rsp
    paddw                m0, m1
    lea                 r5d, [hq+32]      ; edge pixels left to filter (w + h - 32)
    psrlw                m0, 2
    mova               [r3], m0
.w64_filter_loop:
    mova                 m0, [tlq+30]
    paddw                m1, m2, [tlq+28]
    add                 tlq, 32
    paddw                m0, [tlq+0]
    pavgw                m1, [tlq+4]
    paddw                m0, [tlq+2]
    add                  r3, 32
    paddw                m0, m1
    psrlw                m0, 2
    mova               [r3], m0
    sub                 r5d, 16
    jg .w64_filter_loop
    ; Last 16 edge pixels, with the final sample repeated past the end.
    movu                 m0, [tlq+32]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
    punpckhwd            m1, m0, m0
    paddw                m2, [tlq+28]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    paddw                m0, [tlq+30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    vpblendd             m3, m1, [tlq+36], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
    vpblendd             m1, [tlq+34], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
    pavgw                m2, m3
    paddw                m0, m1
    paddw                m0, m2
    mov                 tlq, rsp           ; read the filtered edge from the stack
    psrlw                m0, 2
    mova            [r3+32], m0
.w64_main:
    movd                xm4, dxd
    vpbroadcastw         m6, [tlq+maxbaseq*2]   ; edge pixel for out-of-range fill
    shl            maxbased, 6
    vpbroadcastw         m4, xm4
    movd                xm0, maxbased
    mov                 r5d, dxd               ; xpos accumulator
    vpbroadcastd         m7, [pw_m1024] ; -16 * 64
    vpbroadcastw         m0, xm0
    paddw                m3, m4, [z_base_inc]
    paddw                m8, m7, m7     ; -32 * 64
    psubw                m3, m0         ; per-lane xpos - maxbase
    paddw                m9, m8, m7     ; -48 * 64
.w64_loop:
    mov                 r3d, r5d
    shr                 r3d, 6                  ; base_x
    movu                 m0, [tlq+r3*2]
    movu                 m1, [tlq+r3*2+2]
    pand                 m2, m5, m3             ; m5 = fraction mask (set up earlier; presumably pw_62)
    psllw                m2, 9                  ; scale fraction for pmulhrsw rounding
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    psraw                m1, m3, 15             ; xpos >= maxbase mask, pixels 0-15
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*0], m0
    movu                 m0, [tlq+r3*2+32]
    movu                 m1, [tlq+r3*2+34]
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    pcmpgtw              m1, m7, m3             ; pixels 16-31
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*1], m0
    movu                 m0, [tlq+r3*2+64]
    movu                 m1, [tlq+r3*2+66]
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    pcmpgtw              m1, m8, m3             ; pixels 32-47
    vpblendvb            m0, m6, m0, m1
    mova        [dstq+32*2], m0
    movu                 m0, [tlq+r3*2+96]
    movu                 m1, [tlq+r3*2+98]
    add                 r5d, dxd
    psubw                m1, m0
    pmulhrsw             m1, m2
    pcmpgtw              m2, m9, m3             ; pixels 48-63
    paddw                m3, m4
    paddw                m0, m1
    vpblendvb            m0, m6, m0, m2
    mova        [dstq+32*3], m0
    dec                  hd
    jz .w64_end
    add                dstq, strideq
    cmp                 r5d, maxbased
    jb .w64_loop
.w64_end_loop:
    ; Remaining rows are entirely past the edge: fill with the edge pixel.
    mova        [dstq+32*0], m6
    mova        [dstq+32*1], m6
    mova        [dstq+32*2], m6
    mova        [dstq+32*3], m6
    add                dstq, strideq
    dec                  hd
    jg .w64_end_loop
.w64_end:
    RET
1841
; z2 prediction: angles 90-180, reads both the top and the left edge.
; Looks up dy = dr_intra_derivative[angle-90] and dx = dr_intra_derivative[180-angle],
; and copies 160 bytes of edge pixels onto the stack so that left pixels can be
; addressed with a simple (possibly negative) word index.
cglobal ipred_z2_16bpc, 3, 12, 12, 352, dst, stride, tl, w, h, angle, dx, dy
%define base r9-z_filter_t0
    lea                  r9, [ipred_z2_16bpc_avx2_table]
    tzcnt                wd, wm
    movifnidn        angled, anglem
    movifnidn            hd, hm
    lea                 dxq, [dr_intra_derivative-90]
    movsxd               wq, [r9+wq*4]
    mova                 m1, [tlq-  0]
    movzx               dyd, angleb
    xor              angled, 0x400
    mova                 m2, [tlq- 32]
    mov                  r8, dxq
    sub                 dxq, dyq
    mova                 m3, [tlq- 64]
    add                  wq, r9
    add                  r9, z_filter_t0-ipred_z2_16bpc_avx2_table
    mova                 m4, [tlq- 96]
    and                 dyd, ~1            ; derivative table is word-indexed
    mova                 m5, [tlq-128]
    and                 dxq, ~1
    movzx               dyd, word [r8+dyq]  ; angle - 90
    movzx               dxd, word [dxq+270] ; 180 - angle
    vpbroadcastd        m11, [base+pw_62]   ; 6-bit fraction mask
    mova          [rsp+128], m1             ; top-left + top edge at rsp+128
    mova          [rsp+ 96], m2             ; left edge below, nearest pixels highest
    mova          [rsp+ 64], m3
    neg                 dxd                 ; z2 steps leftwards along the top
    mova          [rsp+ 32], m4
    neg                 dyq                 ; and upwards along the left
    mova          [rsp+  0], m5
    jmp                  wq
.w4:
    vbroadcasti128      m10, [base+z2_x_shuf]
    vpbroadcastq         m6, [base+z_base_inc+2]
    lea                 r8d, [dxq+(65<<6)] ; xpos
    mov                r10d, (63-4)<<6
    test             angled, 0x400
    jnz .w4_main ; !enable_intra_edge_filter
    lea                 r3d, [hq+2]
    add              angled, 1022
    shl                 r3d, 6
    test                r3d, angled
    jnz .w4_no_upsample_above ; angle >= 130 || h > 8 || (is_sm && h == 8)
    ; Upsample the top edge: out = clip((9*(a+b) - (c+d) + 8) >> 4),
    ; interleaved with the original pixels; dx is doubled to match.
    movq                xm0, [tlq+2]    ; 1 2 3 4
    movq                xm1, [tlq+0]    ; 0 1 2 3
    pshuflw             xm2, xm0, q3321 ; 2 3 4 4
    pshuflw             xm3, xm1, q2100 ; 0 0 1 2
    vpbroadcastw        xm4, r8m        ; pixel_max
    vbroadcasti128      m10, [base+z_upsample]
    paddw               xm1, xm0
    paddw               xm2, xm3
    lea                 r8d, [r8+dxq+(1<<6)]
    psubw               xm2, xm1, xm2
    add                 dxd, dxd
    psraw               xm2, 3
    pxor                xm3, xm3
    sub                r10d, 3<<6
    paddw               xm1, xm2
    paddw                m6, m6
    pmaxsw              xm1, xm3
    sub              angled, 1075 ; angle - 53
    pavgw               xm1, xm3
    lea                 r3d, [hq+3]
    pminsw              xm1, xm4
    xor              angled, 0x7f ; 180 - angle
    punpcklwd           xm1, xm0
    movu          [rsp+130], xm1
    call .filter_strength
    jmp .w4_filter_left
ALIGN function_align
.filter_strength:
    ; In:  r3d = blk_wh selector, angled = adjusted angle (bit 8+ = is_sm).
    ; Out: r3d = comparison bitmask; callers take popcnt(r3d) as the filter
    ;      strength (0 => no filtering). Also leaves m7/m8/m9 for reuse.
    movd                xm8, r3d
    mov                 r3d, angled
    movd                xm7, angled
    vpbroadcastb         m8, xm8
    shr                 r3d, 8 ; is_sm << 1
    vpbroadcastb         m7, xm7
    pcmpeqb              m8, [base+z_filter_wh]
    mova                xm9, [r9+r3*8]
    pand                 m0, m8, m7
    pcmpgtb              m0, m9
    pmovmskb            r3d, m0
    ret
ALIGN function_align
.upsample_left: ; h4/h8
    ; Upsample the left edge in place on the stack buffer:
    ; out = clip((9*(a+b) - (c+d) + 8) >> 4), interleaved with the originals.
    ; dy is doubled to match the doubled resolution.
    mova                xm0, [tlq-16]            ; 8 7 6 5 4 3 2 1
    movu                xm1, [tlq-14]            ; 7 6 5 4 3 2 1 0
    vpbroadcastw        xm4, r8m ; pixel_max
    cmp                  hd, 8
    je .upsample_left_h8
    pshufhw             xm2, xm0, q2100          ; _ _ _ _ 4 4 3 2
    pshufhw             xm3, xm1, q3321          ; _ _ _ _ 2 1 0 0
    jmp .upsample_left_end
.upsample_left_h8:
    pblendw             xm2, xm0, [tlq-18], 0xfe ; 8 8 7 6 5 4 3 2
    pblendw             xm3, xm1, [tlq-12], 0x7f ; 6 5 4 3 2 1 0 0
.upsample_left_end:
    paddw               xm1, xm0
    paddw               xm2, xm3
    psubw               xm2, xm1, xm2
    add                 dyq, dyq
    psraw               xm2, 3
    pxor                xm3, xm3
    paddw               xm1, xm2
    pmaxsw              xm1, xm3
    pavgw               xm1, xm3
    pminsw              xm1, xm4
    punpcklwd           xm2, xm0, xm1
    punpckhwd           xm0, xm1
    ; +gprsize compensates for the return address this call pushed.
    mova  [rsp+ 96+gprsize], xm2
    mova  [rsp+112+gprsize], xm0
    ret
.w4_no_upsample_above:
    ; 3-tap edge filter on the 4 top pixels, strength = popcnt of the
    ; .filter_strength mask, kernels from z_filter_k.
    lea                 r3d, [hq+3]
    sub              angled, 1112 ; angle - 90
    call .filter_strength
    test                r3d, r3d
    jz .w4_no_filter_above
    popcnt              r3d, r3d
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*0]
    psrldq              xm0, xm1, 2     ; 1 2 3 4
    pshuflw             xm2, xm1, q2100 ; 0 0 1 2
    pmullw              xm4, xm0
    pshuflw             xm3, xm0, q3321 ; 2 3 4 4
    paddw               xm1, xm3
    pshuflw             xm3, xm0, q3332 ; 3 4 4 4
    pmullw              xm1, xm5
    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*2]
    paddw               xm2, xm3
    vpbroadcastd        xm3, r6m ; max_width
    pmullw              xm2, xm5
    packssdw            xm3, xm3
    paddw               xm1, xm4
    paddw               xm1, xm2
    psubw               xm3, [base+pw_1to16] ; positive where x < max_width
    pxor                xm4, xm4
    psrlw               xm1, 3
    pminsw              xm3, xm11 ; clip to byte range since there's no variable word blend
    pavgw               xm1, xm4
    vpblendvb           xm1, xm0, xm3 ; keep unfiltered pixels beyond max_width
    movq          [rsp+130], xm1
.w4_no_filter_above:
    lea                 r3d, [hq+2]
    add              angled, 973 ; angle + 883
    shl                 r3d, 6
    test                r3d, angled
    jz .w4_upsample_left ; angle <= 140 || h > 8 || (is_sm && h == 8)
    vpbroadcastd        xm0, [base+pb_90]
    psubb               xm0, xm7 ; 180 - angle
    pand                xm0, xm8 ; reuse from previous filter_strength call
    pcmpgtb             xm0, xm9
    pmovmskb            r3d, xm0
.w4_filter_left:
    test                r3d, r3d
    jz .w4_main
    popcnt              r3d, r3d
    mova                 m0, [tlq-32]  ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    vpbroadcastd         m5, r7m ; max_height
    cmp                 r3d, 3
    je .w4_filter_left_s3
    vpbroadcastd         m2, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
    pmullw               m2, m0
    cmp                  hd, 8
    jl .w4_filter_left_h4
    movu                 m4, [tlq-34]
    punpcklwd            m1, m0, m0
    vpblendd             m1, m4, 0xee  ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
    ; je still tests `cmp hd, 8` above (vector ops don't touch EFLAGS)
    je .w4_filter_left_end
    vpblendd             m1, m4, 0x10  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    jmp .w4_filter_left_end
.w4_upsample_left:
    call .upsample_left
    mov                 r11, -16      ; doubled left step (upsampled)
    vbroadcasti128       m9, [base+z_upsample]
    jmp .w4_main_upsample_left
.w4_filter_left_s3: ; can only be h16
    ; Strength-3 5-tap smoothing, same arithmetic as the z1 .w32 edge filter.
    movu                 m2, [tlq-30]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    vpbroadcastd         m4, [base+pw_3]
    paddw                m1, m0, m2
    punpckhwd            m2, m2
    vpblendd             m2, [tlq-28], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
    punpcklwd           xm3, xm0, xm0
    paddw                m2, m4
    vpblendd             m4, m3, [tlq-34], 0xfe ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
    vpblendd             m3, [tlq-36], 0xfe     ; 0 0 0 1 2 3 4 5   6 8 8 9 a b c d
    paddw                m1, m4
    pavgw                m2, m3
    paddw                m1, m2
    psrlw                m1, 2
    jmp .w4_filter_left_end2
.w4_filter_left_h4:
    pshufhw              m1, m0, q2100 ; _ _ _ _ _ _ _ _   _ _ _ _ c c d e
.w4_filter_left_end:
    paddw                m1, [tlq-30]  ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    pmullw               m1, m3
    paddw                m1, m2
    pxor                 m2, m2
    psrlw                m1, 3
    pavgw                m1, m2
.w4_filter_left_end2:
    ; Keep unfiltered pixels beyond max_height (same blend trick as above).
    packssdw             m5, m5
    psubw                m5, [base+pw_16to1]
    pminsw               m5, m11
    vpblendvb            m1, m0, m5
    mova           [rsp+96], m1
.w4_main:
    vbroadcasti128       m9, [base+z2_x_shuf]
    mov                 r11, -8       ; per-4-rows step through the left buffer
.w4_main_upsample_left:
    ; Set up per-row x positions (m6, stepped by m7 = 4*dx) and per-row
    ; y fractions (m5). r5d/r8d/r9d become the relative base_y offsets of
    ; rows 1-3 (dy*2>>6, dy*3>>6, dy*4>>6 relative to row 0), negated for
    ; upward indexing; dyq points into the stack-resident left edge.
    movd                xm5, dyd
    mova                 m4, [base+z2_y_shuf_h4]
    mov                 r2d, r8d
    movd                xm0, dxd
    vpbroadcastw         m5, xm5
    rorx                 r5, dyq, 5    ; dy*2 >> 6
    lea                 r8d, [dyq*3]
    pmullw               m5, [base+z2_ymul]
    rorx                 r9, dyq, 4    ; dy*4 >> 6
    sar                 dyd, 6
    vpbroadcastw         m0, xm0
    sar                 r8d, 6         ; dy*3 >> 6
    pand                 m5, m11       ; frac_y
    neg                 dyd
    psllw                m5, 9
    add                 r5d, dyd
    add                 r8d, dyd
    add                 r9d, dyd
    paddw                m7, m0, m0
    lea                 dyq, [rsp+dyq*2+126]
    vpblendd             m0, m7, 0xcc
    add                 dyq, r11
    neg                 r5d
    paddw                m1, m0, m7
    neg                 r8d
    vpblendd             m0, m1, 0xf0  ; xpos0 xpos1 xpos2 xpos3
    neg                 r9d
    paddw                m7, m7
    paddw                m6, m0
.w4_loop:
    ; Interpolate 4 rows along the top edge (base_x), then overwrite the
    ; lanes whose xpos went past the top-left with left-edge interpolation.
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6         ; base_x0
    movu                xm1, [rsp+r2*2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6         ; base_x1
    movu                xm3, [rsp+r3*2]
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6         ; base_x2
    vinserti128          m1, [rsp+r2*2], 1
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6         ; base_x3
    vinserti128          m3, [rsp+r3*2], 1
    pshufb               m1, m10 ; a0 a1 a2 a3 A0 A1 A2 A3
    pshufb               m3, m10 ; b0 b1 b2 b3 B0 B1 B2 B3
    pand                 m2, m11, m6
    punpcklqdq           m0, m1, m3
    punpckhqdq           m1, m3
    psllw                m2, 9
    psubw                m1, m0
    pmulhrsw             m1, m2
    paddw                m0, m1
    cmp                 r3d, 64        ; all 4 rows still right of the top-left?
    jge .w4_toponly
    movu                xm2, [dyq]
    vinserti128          m2, [dyq+r8*2], 1
    movu                xm3, [dyq+r5*2]
    vinserti128          m3, [dyq+r9*2], 1
    pshufb               m2, m9
    pshufb               m3, m9
    punpckhwd            m1, m2, m3 ; a3 b3 a2 b2 a1 b1 a0 b0
    punpcklwd            m2, m3
    psubw                m2, m1
    pmulhrsw             m2, m5
    psraw                m3, m6, 15 ; base_x < topleft
    paddw                m1, m2
    vpermd               m1, m4, m1 ; a0 b0 c0 d0 a1 b1 c1 d1   a2 b2 c2 d2 a3 b3 c3 d3
    vpblendvb            m0, m1, m3
.w4_toponly:
    paddw                m6, m7     ; xpos += dx
    lea                  r3, [strideq*3]
    add                 dyq, r11
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r3       ], xm1
    sub                  hd, 4
    jz .w4_end
    lea                dstq, [dstq+strideq*4]
    cmp                 r2d, r10d      ; top edge exhausted?
    jge .w4_loop
.w4_leftonly_loop:
    ; All remaining rows read only from the left edge.
    movu                xm1, [dyq]
    vinserti128          m1, [dyq+r8*2], 1
    movu                xm2, [dyq+r5*2]
    vinserti128          m2, [dyq+r9*2], 1
    add                 dyq, r11
    pshufb               m1, m9
    pshufb               m2, m9
    punpckhwd            m0, m1, m2
    punpcklwd            m1, m2
    psubw                m1, m0
    pmulhrsw             m1, m5
    paddw                m0, m1
    vpermd               m0, m4, m0
    vextracti128        xm1, m0, 1
    movq   [dstq+strideq*0], xm0
    movhps [dstq+strideq*1], xm0
    movq   [dstq+strideq*2], xm1
    movhps [dstq+r3       ], xm1
    lea                dstq, [dstq+strideq*4]
    sub                  hd, 4
    jg .w4_leftonly_loop
.w4_end:
    RET
.w8:
    mov                r10d, hd
    test             angled, 0x400
    jnz .w8_main ; !enable_intra_edge_filter
    ; r3b <- h merges h into the angle test so one compare covers all
    ; conditions on the next line.
    lea                 r3d, [angleq+126]
    xor                 r8d, r8d
    mov                 r3b, hb
    cmp                 r3d, 8
    ja .w8_no_upsample_above ; angle >= 130 || h > 8 || is_sm
    ; Upsample the top edge (same [-1 9 9 -1]/16 kernel as .w4); r8d = -1
    ; marks "above was upsampled" for the left-filter path.
    movu                xm0, [tlq+2]            ; 1 2 3 4 5 6 7 8
    mova                xm1, [tlq+0]            ; 0 1 2 3 4 5 6 7
    pblendw             xm2, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8
    pblendw             xm3, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6
    vpbroadcastw        xm4, r8m ; pixel_max
    paddw               xm1, xm0
    paddw               xm2, xm3
    not                 r8d
    psubw               xm2, xm1, xm2
    add                 dxd, dxd
    psraw               xm2, 3
    sub              angled, 53 ; angle - 53
    pxor                xm3, xm3
    paddw               xm2, xm1
    lea                 r3d, [hq+7]
    pmaxsw              xm2, xm3
    xor              angled, 0x7f ; 180 - angle
    pavgw               xm2, xm3
    pminsw              xm2, xm4
    punpcklwd           xm1, xm2, xm0
    punpckhwd           xm2, xm0
    movu          [rsp+130], xm1
    movu          [rsp+146], xm2
    call .filter_strength
    jmp .w8_filter_left
.w8_no_upsample_above:
    ; 3-tap edge filter on the 8 top pixels, clipped at max_width.
    lea                 r3d, [hq+7]
    sub              angled, 90 ; angle - 90
    call .filter_strength
    test                r3d, r3d
    jz .w8_no_filter_above
    popcnt              r3d, r3d
    vpbroadcastd        xm4, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd        xm5, [base+z_filter_k-4+r3*4+12*0]
    vpbroadcastd        xm6, [base+z_filter_k-4+r3*4+12*2]
    movu                xm0, [tlq+2]            ; 1 2 3 4 5 6 7 8 x
    pblendw             xm2, xm1, [tlq-2], 0xfe ; 0 0 1 2 3 4 5 6 x
    pmullw              xm4, xm0
    pblendw             xm3, xm0, [tlq+4], 0x7f ; 2 3 4 5 6 7 8 8 x
    paddw               xm1, xm3
    vpblendd            xm3, [tlq+6], 0x07      ; 3 4 5 6 7 8 8 8 x
    paddw               xm2, xm3
    vpbroadcastd        xm3, r6m ; max_width
    pmullw              xm1, xm5
    pmullw              xm2, xm6
    packssdw            xm3, xm3
    paddw               xm1, xm4
    paddw               xm1, xm2
    psubw               xm3, [base+pw_1to16]
    pxor                xm4, xm4
    psrlw               xm1, 3
    pminsw              xm3, xm11
    pavgw               xm1, xm4
    vpblendvb           xm1, xm0, xm3
    movu          [rsp+130], xm1
.w8_no_filter_above:
    lea                 r3d, [angleq-51]
    mov                 r3b, hb
    cmp                 r3d, 8
    jbe .w8_upsample_left ; angle > 140 && h <= 8 && !is_sm
    vpbroadcastd         m0, [base+pb_90]
    psubb                m0, m7
    pand                 m0, m8
    pcmpgtb              m0, m9
    pmovmskb            r3d, m0
.w8_filter_left:
    test                r3d, r3d
    jz .w8_main
    popcnt              r3d, r3d
    cmp                 r3d, 3
    jne .w8_filter_left_s12
    vpbroadcastd         m6, [base+pw_3]
    vpbroadcastd         m7, [base+pw_16]
    cmp                  hd, 16 ; flags needed for later
    jmp .filter_left_s3b
.w8_upsample_left:
    call .upsample_left
    vbroadcasti128       m7, [base+z2_y_shuf_us]
    lea                 r11, [rsp+118]
    mov                  r8, -8       ; doubled left step (upsampled)
    jmp .w8_main_upsample_left
.w16_filter_left_s12:
    xor                 r8d, r8d      ; w16 never has an upsampled top edge
.w8_filter_left_s12:
    ; Strength-1/2 3-tap filter on the left edge, clipped at max_height.
    mova                 m0, [tlq-32]  ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
    vpbroadcastd         m5, r7m ; max_height
    vpbroadcastd         m2, [base+z_filter_k-4+r3*4+12*1]
    vpbroadcastd         m3, [base+z_filter_k-4+r3*4+12*0]
    pmullw               m2, m0
    cmp                  hd, 8
    jl .w8_filter_left_h4
    movu                 m4, [tlq-34]
    punpcklwd            m1, m0, m0
    vpblendd             m1, m4, 0xee  ; 0 0 1 2 3 4 5 6   8 8 9 a b c d e
    ; je still tests `cmp hd, 8` above (vector ops don't touch EFLAGS)
    je .w8_filter_left_end
    vpblendd             m1, m4, 0x10  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
    jmp .w8_filter_left_end
.w8_filter_left_h4:
    pshufhw              m1, m0, q2100 ; _ _ _ _ _ _ _ _   _ _ _ _ c c d e
.w8_filter_left_end:
    paddw                m1, [tlq-30]  ; 1 2 3 4 5 6 7 8   9 a b c d e f g
    pmullw               m1, m3
    paddw                m1, m2
    pxor                 m2, m2
    psrlw                m1, 3
    pavgw                m1, m2
    packssdw             m5, m5
    psubw                m5, [base+pw_16to1]
    pminsw               m5, m11
    vpblendvb            m1, m0, m5    ; keep unfiltered pixels beyond max_height
    mova           [rsp+96], m1
    test                r8d, r8d       ; set by the upsample-above path
    jz .w8_main
; upsample_main
    ; Variant of .w8_main for a doubled-resolution top edge: base_x indexes
    ; interleaved pixels (via z_upsample shuffle), left rows are gathered.
    vbroadcasti128      m10, [base+z_upsample]
    vbroadcasti128       m7, [base+z2_y_shuf]
    lea                  r5, [rsp+120]
    movd                xm1, dyd
    vbroadcasti128       m4, [base+z_base_inc+2]
    movd                xm2, dxd
    vpbroadcastw         m1, xm1
    vpbroadcastw         m2, xm2
    mov                  r7, dstq
    paddw                m4, m4
    pmullw               m0, m1, [base+z2_ymul8]
    paddw                m5, m2, m2
    psllw               xm1, 3
    vpblendd             m2, m5, 0xf0
    lea                 r2d, [dxq+(66<<6)] ; xpos
    paddw                m4, m2
    pshufd               m6, m0, q2020
    psraw               xm0, 6
    pxor                xm1, xm1
    psubw               xm8, xm1, xm0      ; base_y (negated for upward indexing)
    pand                 m6, m11           ; frac_y
    punpckhwd           xm9, xm8, xm1
    psllw                m6, 9
    punpcklwd           xm8, xm1
.w8_upsample_above_loop:
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6
    movu                xm1, [rsp+r2*2]
    movu                xm2, [rsp+r2*2+16]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6
    vinserti128          m1, [rsp+r3*2], 1
    vinserti128          m2, [rsp+r3*2+16], 1
    pshufb               m1, m10
    pshufb               m2, m10
    punpcklqdq           m0, m1, m2   ; a0 b0 c0 d0 e0 f0 g0 h0
    punpckhqdq           m1, m2
    pand                 m2, m11, m4
    psubw                m1, m0
    psllw                m2, 9
    pmulhrsw             m1, m2
    paddw                m0, m1
    cmp                 r3d, 64
    jge .w8_upsample_above_toponly
    ; vpgatherdq clobbers its mask register, hence the m5 save/restore dance.
    mova                 m1, m5
    vpgatherdq           m3, [r5+xm9*2], m5
    mova                 m5, m1
    vpgatherdq           m2, [r5+xm8*2], m1
    pshufb               m3, m7
    pshufb               m2, m7
    punpckldq            m1, m2, m3
    punpckhdq            m2, m3
    psubw                m2, m1
    pmulhrsw             m2, m6
    paddw                m1, m2
    vpermq               m1, m1, q3120
    psraw                m2, m4, 15
    vpblendvb            m0, m1, m2
.w8_upsample_above_toponly:
    paddw                m4, m5
    sub                  r5, 4
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    sub                  hd, 2
    jz .w8_ret
    lea                dstq, [dstq+strideq*2]
    jmp .w8_upsample_above_loop
.w8_main:
    vbroadcasti128       m7, [base+z2_y_shuf]
    lea                 r11, [rsp+120]
    mov                  r8, -4        ; per-2-rows step through the left buffer
.w8_main_upsample_left:
    ; Main z2 loop, 8 columns x 2 rows per iteration. Also used by w16/w32/w64
    ; via .w8_loop0, which restarts it per 8-column strip (r10d high byte
    ; counts strips, low byte restores h).
    movd                xm1, dyd
    vbroadcasti128       m4, [base+z_base_inc+2]
    movd                xm2, dxd
    vpbroadcastw         m1, xm1
    vpbroadcastw         m2, xm2
    mov                  r7, dstq
    pmullw               m0, m1, [base+z2_ymul8]
    paddw                m5, m2, m2
    psllw               xm1, 3
    vpblendd             m2, m5, 0xf0 ; xpos0 xpos1
    lea                 r9d, [dxq+(65<<6)] ; xpos
    paddw                m4, m2
    movd          [rsp+284], xm1       ; save 8*dy for the next strip
.w8_loop0:
    mov                 r2d, r9d
    mova          [rsp+288], m0        ; save base_y, restored per strip
    mov                  r5, r11
    mova          [rsp+320], m4        ; save xpos, restored per strip
    pshufd               m6, m0, q2020
    psraw               xm0, 6
    pxor                xm1, xm1
    psubw               xm8, xm1, xm0 ; base_y
    pand                 m6, m11      ; frac_y
    punpckhwd           xm9, xm8, xm1 ; base_y 2 3 6 7
    psllw                m6, 9
    punpcklwd           xm8, xm1      ; base_y 0 1 4 5
.w8_loop:
    lea                 r3d, [r2+dxq]
    shr                 r2d, 6        ; base_x0
    movu                xm0, [rsp+r2*2]
    movu                xm1, [rsp+r2*2+2]
    lea                 r2d, [r3+dxq]
    shr                 r3d, 6        ; base_x1
    vinserti128          m0, [rsp+r3*2], 1
    vinserti128          m1, [rsp+r3*2+2], 1
    pand                 m2, m11, m4
    psubw                m1, m0
    psllw                m2, 9
    pmulhrsw             m1, m2
    paddw                m0, m1
    cmp                 r3d, 64       ; both rows still right of the top-left?
    jge .w8_toponly
    ; vpgatherdq clobbers its mask register, hence the m5 save/restore dance.
    mova                 m1, m5
    vpgatherdq           m3, [r5+xm9*2], m5
    mova                 m5, m1
    vpgatherdq           m2, [r5+xm8*2], m1
    pshufb               m3, m7       ; c0 d0 c1 d1               g0 h0 g1 h1
    pshufb               m2, m7       ; a0 b0 a1 b1               e0 f0 e1 f1
    punpckldq            m1, m2, m3   ; a0 b0 c0 d0 a1 b1 c1 d1   e0 f0 g0 h0 e1 f1 g1 h1
    punpckhdq            m2, m3
    psubw                m2, m1
    pmulhrsw             m2, m6
    paddw                m1, m2
    vpermq               m1, m1, q3120
    psraw                m2, m4, 15   ; base_x < topleft
    vpblendvb            m0, m1, m2
.w8_toponly:
    paddw                m4, m5       ; xpos += dx
    add                  r5, r8
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    sub                  hd, 2
    jz .w8_end
    lea                dstq, [dstq+strideq*2]
    cmp                 r2d, (63-8)<<6 ; top edge exhausted?
    jge .w8_loop
.w8_leftonly_loop:
    ; All remaining rows read only from the left edge.
    mova                 m0, m5
    vpgatherdq           m4, [r5+xm9*2], m5
    mova                 m5, m0
    vpgatherdq           m3, [r5+xm8*2], m0
    add                  r5, r8
    pshufb               m2, m4, m7
    pshufb               m1, m3, m7
    punpckldq            m0, m1, m2
    punpckhdq            m1, m2
    psubw                m1, m0
    pmulhrsw             m1, m6
    paddw                m0, m1
    vpermq               m0, m0, q3120
    mova         [dstq+strideq*0], xm0
    vextracti128 [dstq+strideq*1], m0, 1
    lea                dstq, [dstq+strideq*2]
    sub                  hd, 2
    jg .w8_leftonly_loop
.w8_end:
    sub                r10d, 1<<8     ; one 8-column strip done
    jl .w8_ret
    vpbroadcastd         m0, [rsp+284]
    add                  r7, 16       ; dst += 8 pixels
    paddw                m0, [rsp+288] ; base_y += 8*dy
    add                 r9d, 8<<6
    vpbroadcastd         m4, [pw_512]  ; 8<<6 per lane
    movzx                hd, r10b     ; restore h for the next strip
    paddw                m4, [rsp+320] ; base_x += 8*64
    mov                dstq, r7
    jmp .w8_loop0
.w8_ret:
    RET
; z2 w=16 entry: stash the sample past the top edge, then (unless the edge
; filter is disabled via bit 0x400 of angle) smooth the above/left edges
; before falling back into the shared .w8_main prediction loop.
2453.w16:
2454    movd                xm0, [tlq+32]
2455    lea                r10d, [hq+(1<<8)] ; 2 column strips (high byte) + height
2456    movd          [rsp+160], xm0
2457    test             angled, 0x400      ; !enable_intra_edge_filter
2458    jnz .w8_main
2459    lea                 r3d, [hq+15]
2460    sub              angled, 90
2461    call .filter_strength
2462    test                r3d, r3d
2463    jz .w16_no_filter_above
2464    popcnt              r3d, r3d        ; filter strength = number of set thresholds
2465    vpbroadcastd         m4, [base+z_filter_k-4+r3*4+12*1]
2466    vpbroadcastd         m5, [base+z_filter_k-4+r3*4+12*0]
2467    vpbroadcastd         m6, [base+z_filter_k-4+r3*4+12*2]
2468    movu                 m0, [tlq+2]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2469    punpcklwd           xm2, xm1, xm1
2470    vpblendd             m2, [tlq-2], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2471    punpckhwd            m3, m0, m0
2472    pmullw               m4, m0
2473    vpblendd             m3, [tlq+4], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
2474    paddw                m1, m3
2475    vpblendd             m3, [tlq+6], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g g g
2476    paddw                m2, m3
2477    vpbroadcastd         m3, r6m ; max_width
2478    pmullw               m1, m5
2479    pmullw               m2, m6
2480    packssdw             m3, m3
2481    paddw                m1, m4
2482    paddw                m1, m2
2483    psubw                m3, [base+pw_1to16]   ; per-lane: max_width - x
2484    pxor                 m4, m4
2485    psrlw                m1, 3                 ; (sum + 4) >> 3 via pavgw below
2486    pminsw               m3, m11
2487    pavgw                m1, m4
2488    vpblendvb            m1, m0, m3            ; keep raw samples past max_width
2489    movu          [rsp+130], m1
2490.w16_no_filter_above:
; decide left-edge filter strength from the angle tables set up by the caller
2491    vpbroadcastd         m0, [base+pb_90]
2492    psubb                m0, m7
2493    pand                 m0, m8
2494    pcmpgtb              m0, m9
2495    pmovmskb            r3d, m0
2496    test                r3d, r3d
2497    jz .w8_main
2498    popcnt              r3d, r3d
2499    cmp                 r3d, 3
2500    jne .w16_filter_left_s12
2501    vpbroadcastd         m6, [base+pw_3]
2502    vpbroadcastd         m7, [base+pw_16]
2503    cmp                  hd, 4
2504    jne .filter_left_s3
; strength-3 left filter for h=4: 5-tap smoothing of the 4 left samples
2505    movq                xm0, [tlq-8]    ; 0 1 2 3
2506    movq                xm1, [tlq-6]    ; 1 2 3 4
2507    vpbroadcastd        xm5, r7m ; max_height
2508    movq                xm4, [base+pw_16to1+24] ; 4to1
2509    pshuflw             xm2, xm0, q2100 ; 0 0 1 2
2510    pshuflw             xm3, xm1, q3321 ; 2 3 4 4
2511    paddw               xm1, xm0
2512    paddw               xm1, xm2
2513    pshuflw             xm2, xm0, q1000 ; 0 0 0 1
2514    paddw               xm3, xm6
2515    packssdw            xm5, xm5
2516    pavgw               xm2, xm3
2517    psubw               xm5, xm4
2518    paddw               xm1, xm2
2519    pminsw              xm5, xm11
2520    psrlw               xm1, 2
2521    vpblendvb           xm1, xm0, xm5   ; keep raw samples past max_height
2522    movq          [rsp+120], xm1
2523    jmp .w8_main
; z2 w=32 entry: copy the second 16 top samples (+ overflow sample) to the
; stack buffer, then 3-tap-filter both 16-wide halves of the top edge.
; Falls through into .filter_left_s3 for the left edge.
2524.w32:
2525    mova                 m2, [tlq+32]
2526    movd                xm0, [tlq+64]
2527    lea                r10d, [hq+(3<<8)]  ; 4 column strips
2528    mova          [rsp+160], m2
2529    movd          [rsp+192], xm0
2530    test             angled, 0x400       ; !enable_intra_edge_filter
2531    jnz .w8_main
2532    vpbroadcastd         m6, [base+pw_3]
2533    vpbroadcastd         m0, r6m ; max_width
2534    vpbroadcastd         m7, [base+pw_16]
2535    mov                 r3d, 32
2536    packssdw             m0, m0
2537    psubw                m0, [base+pw_1to16]
2538    pminsw               m8, m0, m11     ; m8/m9: per-half max_width clip masks
2539    psubw                m9, m8, m7
2540.w32_filter_above:                       ; also reached from .w64 with r3d=96
2541    movu                 m0, [tlq+2]
2542    punpcklwd           xm4, xm1, xm1
2543    paddw                m2, m6, [tlq+6]
2544    paddw                m1, m0
2545    vpblendd             m4, [tlq-2], 0xfe        ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
2546    paddw                m1, [tlq+4]
2547    movu                 m3, [tlq+r3+2]
2548    paddw                m5, m6, [tlq+r3-2]
2549    pavgw                m2, m4
2550    punpckhwd            m4, m3, m3
2551    paddw                m1, m2
2552    vpblendd             m2, m4, [tlq+r3+6], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h h h
2553    vpblendd             m4, [tlq+r3+4], 0x7f     ; 3 4 5 6 7 8 9 a   b c d e f g h h
2554    pavgw                m2, m5
2555    paddw                m5, m3, [tlq+r3]
2556    paddw                m4, m5
2557    psrlw                m1, 2
2558    paddw                m2, m4
2559    vpblendvb            m1, m0, m8      ; keep raw samples past max_width
2560    psrlw                m2, 2
2561    vpblendvb            m2, m3, m9
2562    movu          [rsp+130], m1
2563    movu       [rsp+r3+130], m2
; Strength-3 (5-tap) smoothing of the left edge, processed in 16-sample
; chunks from [tlq-32] downwards; each chunk is clipped against max_height
; (m8, decremented by 16 per chunk) so out-of-range lanes keep raw samples.
2564.filter_left_s3:
2565    cmp                  hd, 16
2566    jl .filter_left_s3_h8 ; h8
2567.filter_left_s3b:
2568    mova                 m0, [tlq-32]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
2569    movu                 m2, [tlq-30]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
2570    vpbroadcastd         m5, r7m ; max_height
2571    paddw                m1, m0, m2
2572    punpckhwd            m2, m2
2573    mov                 r3d, hd
2574    vpblendd             m2, [tlq-28], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
2575    packssdw             m5, m5
2576    not                  r3                 ; r3 = ~h, used below as negative index
2577    psubw                m5, [base+pw_16to1]
2578    paddw                m2, m6
2579    pminsw               m8, m11, m5
2580    je .filter_left_s3_end ; h16
2581    paddw                m1, [tlq-34]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2582    pavgw                m2, [tlq-36]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2583    paddw                m1, m2
2584    psrlw                m1, 2
2585    vpblendvb            m3, m1, m0, m8
2586    mova                 m0, [tlq-64]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
2587    paddw                m1, m0, [tlq-62]   ; 3 4 5 6 7 8 9 a   b c d e f g h i
2588    paddw                m2, m6, [tlq-60]   ; 4 5 6 7 8 9 a b   c d e f g h i j
2589    psubw                m8, m7             ; shift clip window down 16 rows
2590    mova           [rsp+96], m3
2591    jnp .filter_left_s3_end ; h32
; h=64: two more 16-sample chunks ([tlq-96] and [tlq-128])
2592    mova                 m5, [tlq-96]
2593    paddw                m1, [tlq-66]
2594    pavgw                m2, [tlq-68]
2595    paddw                m1, m2
2596    paddw                m4, m5, [tlq-94]
2597    paddw                m2, m6, [tlq-92]
2598    psrlw                m1, 2
2599    paddw                m4, [tlq- 98]
2600    pavgw                m2, [tlq-100]
2601    vpblendvb            m3, m1, m0, m8
2602    mova                 m0, [tlq-128]
2603    psubw                m8, m7
2604    paddw                m4, m2
2605    paddw                m1, m0, [tlq-126]
2606    paddw                m2, m6, [tlq-124]
2607    psrlw                m4, 2
2608    mova           [rsp+64], m3
2609    vpblendvb            m4, m5, m8
2610    psubw                m8, m7
2611    mova           [rsp+32], m4
2612.filter_left_s3_end:
; bottom-most chunk: pad below the edge by replicating the last sample
; (negative r3 index reaches the padding position)
2613    punpcklwd           xm3, xm0, xm0
2614    vpblendd             m4, m3, [tlq+r3*2], 0xfe ; 2 2 3 4 5 6 7 8   9 a b c d e f g
2615    vpblendd             m3, [tlq+r3*2-2], 0xfe   ; 2 2 2 3 4 5 6 7   8 9 a b c d e f
2616    paddw                m1, m4
2617    pavgw                m2, m3
2618    paddw                m1, m2
2619    psrlw                m1, 2
2620    vpblendvb            m1, m0, m8
2621    mova     [rsp+r3*2+130], m1
2622    jmp .w8_main
; Strength-3 (5-tap) left-edge smoothing for h=8: single 8-sample chunk,
; with edge replication at both ends and max_height clipping.
2623.filter_left_s3_h8:
2624    mova                xm0, [tlq-16]            ; 0 1 2 3 4 5 6 7
2625    movu                xm3, [tlq-14]            ; 1 2 3 4 5 6 7 8
2626    pblendw             xm2, xm0, [tlq-18], 0xfe ; 0 0 1 2 3 4 5 6
2627    vpbroadcastd        xm5, r7m ; max_height
2628    paddw               xm1, xm0, xm3
2629    pblendw             xm3, [tlq-12], 0x7f      ; 2 3 4 5 6 7 8 8
2630    paddw               xm1, xm2
2631    vpblendd            xm2, [tlq-20], 0x0e      ; 0 0 0 1 2 3 4 5
2632    paddw               xm3, xm6
2633    packssdw            xm5, xm5
2634    pavgw               xm2, xm3
2635    psubw               xm5, [base+pw_16to1+16] ; 8to1
2636    paddw               xm1, xm2
2637    pminsw              xm5, xm11
2638    psrlw               xm1, 2
2639    vpblendvb           xm1, xm0, xm5            ; keep raw samples past max_height
2640    mova          [rsp+112], xm1
2641    jmp .w8_main
; z2 w=64 entry: copy top samples 16..63 (+ overflow) to the stack buffer,
; filter the two middle 16-wide halves inline, then jump to
; .w32_filter_above (with r3d=96) to filter the outer halves.
2642.w64:
2643    mova                 m2, [tlq+ 32]
2644    mova                 m3, [tlq+ 64]
2645    mova                 m4, [tlq+ 96]
2646    movd                xm0, [tlq+128]
2647    lea                r10d, [hq+(7<<8)]  ; 8 column strips
2648    mova          [rsp+160], m2
2649    mova          [rsp+192], m3
2650    mova          [rsp+224], m4
2651    movd          [rsp+256], xm0
2652    test             angled, 0x400       ; !enable_intra_edge_filter
2653    jnz .w8_main
2654    vpbroadcastd         m6, [base+pw_3]
2655    movu                 m0, [tlq+34]     ; 2 3 4 5 6 7 8 9   a b c d e f g h
2656    paddw                m2, m6, [tlq+30] ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2657    paddw                m5, m0, [tlq+32] ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2658    pavgw                m2, [tlq+38]     ; 4 5 6 7 8 9 a b   c d e f g h h h
2659    paddw                m5, [tlq+36]     ; 3 4 5 6 7 8 9 a   b c d e f g h h
2660    movu                 m4, [tlq+66]
2661    paddw                m3, m6, [tlq+62]
2662    paddw                m7, m4, [tlq+64]
2663    pavgw                m3, [tlq+70]
2664    paddw                m7, [tlq+68]
2665    paddw                m2, m5
2666    vpbroadcastd         m5, r6m ; max_width
2667    mov                 r3d, 96
2668    packssdw             m5, m5
2669    paddw                m3, m7
2670    psubw                m5, [base+pw_1to16]
2671    psrlw                m2, 2
2672    vpbroadcastd         m7, [base+pw_16]
2673    psrlw                m3, 2
2674    pminsw               m8, m11, m5
2675    psubw                m9, m8, m7       ; clip mask for samples 16..31
2676    vpblendvb            m2, m0, m9
2677    psubw                m9, m7           ; …for samples 32..47
2678    vpblendvb            m3, m4, m9
2679    psubw                m9, m7           ; …for samples 48..63 (used in .w32_filter_above)
2680    movu          [rsp+162], m2
2681    movu          [rsp+194], m3
2682    jmp .w32_filter_above
2683
; ipred_z3_16bpc: directional intra prediction reading samples below the
; top-left pointer (the left edge); dispatches per-height via a jump table.
2684cglobal ipred_z3_16bpc, 4, 9, 0, dst, stride, tl, w, h, angle, dy, org_w, maxbase
2685    lea                  r6, [ipred_z3_16bpc_avx2_table]
2686    tzcnt                hd, hm
2687    movifnidn        angled, anglem
2688    lea                  r7, [dr_intra_derivative+45*2-1]
2689    sub                 tlq, 2                ; step to the first left sample
2690    movsxd               hq, [r6+hq*4]        ; jump-table entry for this height
2691    sub              angled, 180
2692    add                  hq, r6
2693    mov                 dyd, angled
2694    neg                 dyd
2695    xor              angled, 0x400
2696    or                  dyq, ~0x7e            ; negative offset below r7
2697    movzx               dyd, word [r7+dyq]    ; dy = dr_intra_derivative[angle]
2698    vpbroadcastd         m5, [pw_62]
2699    mov              org_wd, wd
2700    jmp                  hq
; z3 h=4: if the edge-upsampling conditions hold, build a 2x-upsampled
; left edge on the stack, then interpolate 4 rows per 4-wide column.
2701.h4:
2702    ALLOC_STACK         -64, 7
2703    lea                  r7, [strideq*3]
2704    cmp              angleb, 40
2705    jae .h4_no_upsample
2706    lea                 r4d, [angleq-1024]
2707    sar                 r4d, 7
2708    add                 r4d, wd
2709    jg .h4_no_upsample ; !enable_intra_edge_filter || w > 8 || (w == 8 && is_sm)
; upsample: out[2i]=in[i], out[2i+1]=smoothed midpoint, clipped to pixel_max
2710    mova                xm2, [tlq-14]            ; 0 1 2 3 4 5 6 7
2711    pblendw             xm1, xm2, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
2712    vpblendd            xm0, xm1, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
2713    pshufd              xm3, xm1, q0000
2714    paddw               xm1, xm2
2715    paddw               xm0, [tlq-12]            ; 1 2 3 4 5 6 7 8
2716    vpbroadcastw        xm4, r8m ; pixel_max
2717    add                 dyd, dyd                 ; double dy to match 2x edge
2718    psubw               xm0, xm1, xm0
2719    mova           [rsp+ 0], xm3
2720    movd                xm3, dyd
2721    psraw               xm0, 3
2722    neg                 dyd
2723    paddw               xm1, xm0
2724    pxor                xm0, xm0
2725    lea                 r2d, [dyq+(16<<6)+63] ; ypos
2726    pmaxsw              xm1, xm0
2727    pavgw               xm1, xm0
2728    vpbroadcastw         m3, xm3
2729    pminsw              xm1, xm4                 ; clamp to pixel_max
2730    punpckhwd           xm0, xm1, xm2
2731    punpcklwd           xm1, xm2                 ; interleave new/original samples
2732    paddw                m2, m3, m3
2733    mova           [rsp+32], xm0
2734    punpcklwd            m3, m2
2735    mova           [rsp+16], xm1
2736    paddw                m4, m2, m2               ; per-iteration ypos step (4*dy)
2737    paddw                m2, m3
2738    vpblendd             m3, m2, 0xf0 ; ypos0 ypos1   ypos2 ypos3
2739.h4_upsample_loop:
2740    lea                 r4d, [r2+dyq]
2741    shr                 r2d, 6
2742    movu                xm1, [rsp+r2*2]
2743    lea                 r2d, [r4+dyq]
2744    shr                 r4d, 6
2745    movu                xm2, [rsp+r4*2]
2746    lea                 r4d, [r2+dyq]
2747    shr                 r2d, 6
2748    vinserti128          m1, [rsp+r2*2], 1
2749    lea                 r2d, [r4+dyq]
2750    shr                 r4d, 6
2751    vinserti128          m2, [rsp+r4*2], 1
2752    psrld                m0, m1, 16
2753    pblendw              m0, m2, 0xaa ; a3 b3 a2 b2 a1 b1 a0 b0   c3 d3 c2 d2 c1 d1 c0 d0
2754    pslld                m2, 16
2755    pblendw              m1, m2, 0xaa
2756    pand                 m2, m5, m3   ; frac_y
2757    psllw                m2, 9
2758    psubw                m1, m0
2759    pmulhrsw             m1, m2
2760    paddw                m3, m4       ; ypos += step
2761    paddw                m1, m0
2762    vextracti128        xm2, m1, 1
2763    punpckhdq           xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
2764    punpckldq           xm1, xm2      ; a3 b3 c3 d3 a2 b2 c2 d2
2765    movhps [dstq+strideq*0], xm0
2766    movq   [dstq+strideq*1], xm0
2767    movhps [dstq+strideq*2], xm1
2768    movq   [dstq+r7       ], xm1
2769    add                dstq, 8
2770    sub                  wd, 4
2771    jg .h4_upsample_loop
2772    RET
2773ALIGN function_align
2774.filter_strength: ; h4/h8/h16
; In:   maxbased = edge length, angled = angle (bit 9:8 = is_sm flags)
; Out:  r5d = bitmask whose popcount is the edge filter strength (0 = none)
; Sets: r4 = base pointer for z_filter tables; clobbers m0, m1, angled
2775%define base r4-z_filter_t0
2776    lea                  r4, [z_filter_t0]
2777    movd                xm0, maxbased
2778    movd                xm1, angled
2779    shr              angled, 8 ; is_sm << 1
2780    vpbroadcastb         m0, xm0
2781    vpbroadcastb         m1, xm1
2782    pcmpeqb              m0, [base+z_filter_wh]
2783    pand                 m0, m1
2784    mova                xm1, [r4+angleq*8]
2785    pcmpgtb              m0, m1
2786    pmovmskb            r5d, m0
2787    ret
; h=4 without upsampling: optionally smooth the left edge into a stack
; buffer (3-tap, or 5-tap when strength==3), then fall into .h4_main.
2788.h4_no_upsample:
2789    mov            maxbased, 7
2790    test             angled, 0x400 ; !enable_intra_edge_filter
2791    jnz .h4_main
2792    lea            maxbased, [wq+3]
2793    call .filter_strength
2794    mov            maxbased, 7
2795    test                r5d, r5d
2796    jz .h4_main ; filter_strength == 0
2797    popcnt              r5d, r5d
2798    mova                xm0, [tlq-14]       ; 0 1 2 3 4 5 6 7
2799    movu                xm3, [tlq-12]       ; 1 2 3 4 5 6 7 8
2800    vpbroadcastd        xm2, [base+z_filter_k-4+r5*4+12*1]
2801    vpbroadcastd        xm4, [base+z_filter_k-4+r5*4+12*0]
2802    pmullw              xm2, xm0
2803    pblendw             xm0, [tlq-16], 0xfe ; 0 0 1 2 3 4 5 6
2804    paddw               xm1, xm0, xm3
2805    movd           [rsp+12], xm0
2806    pmullw              xm1, xm4
2807    cmp                 r5d, 3
2808    jne .h4_filter_3tap
; strength 3: extend to a 5-tap kernel; also filter the corner sample
; (w-1 + w*8 + 4) >> 3 and grow the usable edge by one
2809    pblendw             xm3, [tlq-10], 0x7f ; 2 3 4 5 6 7 8 8
2810    vpblendd            xm0, [tlq-18], 0x0e ; 0 0 0 1 2 3 4 5
2811    movzx               r4d, word [tlq-14]
2812    movzx               r2d, word [tlq-12]
2813    inc            maxbased
2814    paddw               xm1, xm2
2815    paddw               xm0, xm3
2816    sub                 r2d, r4d
2817    paddw               xm2, xm0, xm0
2818    lea                 r2d, [r2+r4*8+4]
2819    shr                 r2d, 3
2820    mov            [rsp+14], r2w
2821.h4_filter_3tap:
2822    pxor                xm0, xm0
2823    paddw               xm1, xm2
2824    lea                 tlq, [rsp+30]       ; read filtered edge from the stack
2825    psrlw               xm1, 3
2826    cmp                  wd, 8
2827    sbb            maxbased, -1            ; maxbase++ when w >= 8
2828    pavgw               xm0, xm1            ; rounding half of the >>3
2829    mova           [rsp+16], xm0
; h=4 main loop: walk ypos down the (possibly filtered) left edge, producing
; 4x4 interpolated tiles; columns past max_base_y are filled with the
; bottom-most edge sample (broadcast in m6).
2830.h4_main:
2831    movd                xm3, dyd
2832    neg            maxbaseq
2833    vbroadcasti128       m1, [z_base_inc]
2834    vpbroadcastw         m6, [tlq+maxbaseq*2] ; clamp sample for out-of-range rows
2835    shl            maxbased, 6
2836    vpbroadcastw         m3, xm3
2837    lea                 r4d, [maxbaseq+3*64]
2838    neg                 dyq
2839    movd                xm2, r4d
2840    sub                 tlq, 8
2841    lea                  r4, [dyq+63] ; ypos
2842    punpcklwd            m1, m1
2843    paddw                m0, m3, m3
2844    vpbroadcastw         m2, xm2
2845    punpcklwd            m3, m0
2846    paddw                m4, m0, m0          ; per-iteration ypos step (4*dy)
2847    paddw                m0, m3
2848    psubw                m2, m1
2849    vpblendd             m3, m0, 0xf0 ; ypos0 ypos1   ypos2 ypos3
2850    or             maxbased, 63
2851    paddw                m3, m2
2852.h4_loop:
2853    lea                  r5, [r4+dyq]
2854    sar                  r4, 6 ; base0
2855    movu                xm1, [tlq+r4*2]
2856    lea                  r4, [r5+dyq]
2857    sar                  r5, 6 ; base1
2858    movu                xm2, [tlq+r5*2]
2859    lea                  r5, [r4+dyq]
2860    sar                  r4, 6 ; base2
2861    vinserti128          m1, [tlq+r4*2], 1
2862    lea                  r4, [r5+dyq]
2863    sar                  r5, 6 ; base3
2864    vinserti128          m2, [tlq+r5*2], 1
2865    punpckhwd            m0, m1, m2
2866    punpcklwd            m1, m2
2867    pand                 m2, m5, m3   ; frac_y
2868    palignr              m0, m1, 4    ; a3 b3 a2 b2 a1 b1 a0 b0   c3 d3 c2 d2 c1 d1 c0 d0
2869    psllw                m2, 9
2870    psubw                m1, m0
2871    pmulhrsw             m1, m2
2872    psraw                m2, m3, 15   ; ypos < max_base_y
2873    paddw                m3, m4
2874    paddw                m1, m0
2875    vpblendvb            m1, m6, m1, m2
2876    vextracti128        xm2, m1, 1
2877    punpckhdq           xm0, xm1, xm2 ; a1 b1 c1 d1 a0 b0 c0 d0
2878    punpckldq           xm1, xm2      ; a3 b3 c3 d3 a2 b2 c2 d2
2879    movhps [dstq+strideq*0], xm0
2880    movq   [dstq+strideq*1], xm0
2881    movhps [dstq+strideq*2], xm1
2882    movq   [dstq+r7       ], xm1
2883    sub                  wd, 4
2884    jz .h4_end
2885    add                dstq, 8
2886    cmp                 r4d, maxbased
2887    jg .h4_loop
; remaining columns are entirely past the edge: flat fill with the clamp sample
2888.h4_end_loop:
2889    movq   [dstq+strideq*0], xm6
2890    movq   [dstq+strideq*1], xm6
2891    movq   [dstq+strideq*2], xm6
2892    movq   [dstq+r7       ], xm6
2893    add                dstq, 8
2894    sub                  wd, 4
2895    jg .h4_end_loop
2896.h4_end:
2897    RET
; z3 h=8: upsampling path. Builds a 2x-upsampled 16-sample left edge on the
; stack, then interpolates 8 rows per 4-wide column per iteration.
2898.h8:
2899    lea                 r4d, [angleq+216]
2900    ALLOC_STACK         -64, 8
2901    mov                 r4b, wb
2902    lea                  r7, [strideq*3]
2903    cmp                 r4d, 8
2904    ja .h8_no_upsample ; !enable_intra_edge_filter || is_sm || d >= 40 || w > 8
2905    mova                 m2, [tlq-30]     ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2906    paddw                m1, m2, [tlq-32] ; _ 0 1 2 3 4 5 6   7 8 9 a b c d e
2907    movu                 m0, [tlq-34]     ; _ _ 0 1 2 3 4 5   6 7 8 9 a b c d
2908    cmp                  wd, 8
2909    je .h8_upsample_w8
; w=4: replicate the topmost sample instead of reading above the edge
2910    pshufhw             xm3, xm2, q1000
2911    vpblendd             m0, m3, 0x0f     ; _ _ _ _ 4 4 4 5   6 7 8 9 a b c d
2912.h8_upsample_w8:
2913    paddw                m0, [tlq-28]     ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2914    vpbroadcastw         m4, r8m ; pixel_max
2915    add                 dyd, dyd          ; double dy to match 2x edge
2916    psubw                m0, m1, m0
2917    movd                xm6, dyd
2918    psraw                m0, 3
2919    neg                 dyd
2920    paddw                m1, m0
2921    pxor                 m0, m0
2922    pmaxsw               m1, m0
2923    lea                 r4d, [dyq+(16<<6)+63] ; ypos
2924    pavgw                m1, m0
2925    vpbroadcastw         m6, xm6
2926    pminsw               m1, m4            ; clamp to pixel_max
2927    punpckhwd            m0, m1, m2
2928    punpcklwd            m1, m2            ; interleave new/original samples
2929    vextracti128   [rsp+48], m0, 1
2930    vextracti128   [rsp+32], m1, 1
2931    paddw                m7, m6, m6        ; per-step ypos increment (2*dy pairs)
2932    mova           [rsp+16], xm0
2933    mova           [rsp+ 0], xm1
2934    punpcklwd            m6, m7 ; ypos0 ypos1
2935.h8_upsample_loop:
2936    lea                 r2d, [r4+dyq]
2937    shr                 r4d, 6 ; base0
2938    movu                 m1, [rsp+r4*2]
2939    lea                 r4d, [r2+dyq]
2940    shr                 r2d, 6 ; base1
2941    movu                 m2, [rsp+r2*2]
2942    lea                 r2d, [r4+dyq]
2943    shr                 r4d, 6 ; base2
2944    movu                 m3, [rsp+r4*2]
2945    lea                 r4d, [r2+dyq]
2946    shr                 r2d, 6 ; base3
2947    movu                 m4, [rsp+r2*2]
2948    psrld                m0, m1, 16
2949    pblendw              m0, m2, 0xaa ; a7 b7 a6 b6 a5 b5 a4 b4   a3 b3 a2 b2 a1 b1 a0 b0
2950    pslld                m2, 16
2951    pblendw              m1, m2, 0xaa
2952    psrld                m2, m3, 16
2953    pblendw              m2, m4, 0xaa ; c7 d7 c6 d6 c5 d5 c4 d4   c3 d3 c2 d2 c1 d1 c0 d0
2954    pslld                m4, 16
2955    pblendw              m3, m4, 0xaa
2956    pand                 m4, m5, m6   ; frac_y (columns a/b)
2957    paddw                m6, m7
2958    psllw                m4, 9
2959    psubw                m1, m0
2960    pmulhrsw             m1, m4
2961    pand                 m4, m5, m6   ; frac_y (columns c/d)
2962    psllw                m4, 9
2963    psubw                m3, m2
2964    pmulhrsw             m3, m4
2965    paddw                m6, m7
2966    lea                  r2, [dstq+strideq*4]
2967    paddw                m1, m0
2968    paddw                m3, m2
2969    punpckhdq            m0, m1, m3   ; a5 b5 c5 d5 a4 b4 c4 d4   a1 b1 c1 d1 a0 b0 c0 d0
2970    punpckldq            m1, m3       ; a7 b7 c7 d7 a6 b6 c6 d6   a3 b3 c3 d3 a2 b2 c2 d2
2971    vextracti128        xm2, m0, 1
2972    vextracti128        xm3, m1, 1
2973    movhps [r2  +strideq*0], xm0
2974    movq   [r2  +strideq*1], xm0
2975    movhps [r2  +strideq*2], xm1
2976    movq   [r2  +r7       ], xm1
2977    movhps [dstq+strideq*0], xm2
2978    movq   [dstq+strideq*1], xm2
2979    movhps [dstq+strideq*2], xm3
2980    movq   [dstq+r7       ], xm3
2981    add                dstq, 8
2982    sub                  wd, 4
2983    jg .h8_upsample_loop
2984    RET
2985.h8_no_intra_edge_filter:
2986    and            maxbased, 7
2987    or             maxbased, 8 ; imin(w+7, 15)
2988    jmp .h8_main
; h=8 without upsampling: optionally smooth the left edge into a stack
; buffer (3-tap, or 5-tap when strength==3), then fall into .h8_main.
2989.h8_no_upsample:
2990    lea            maxbased, [wq+7]
2991    test             angled, 0x400
2992    jnz .h8_no_intra_edge_filter
2993    call .filter_strength
2994    test                r5d, r5d
2995    jz .h8_main
2996    popcnt              r5d, r5d
2997    mova                 m0, [tlq-30]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
2998    movu                 m3, [tlq-28]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
2999    vpbroadcastd         m2, [base+z_filter_k-4+r5*4+12*1]
3000    vpbroadcastd         m4, [base+z_filter_k-4+r5*4+12*0]
3001    pmullw               m2, m0
3002    cmp                  wd, 8
3003    jl .h8_filter_w4
3004    punpcklwd           xm0, xm0
3005    vpblendd             m1, m0, [tlq-32], 0xfe ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3006    movd           [rsp+28], xm0
3007    paddw                m1, m3
3008    mov                 r4d, 16
3009    pmullw               m1, m4
3010    cmovg          maxbased, r4d               ; w > 8: use the longer edge
3011    cmp                 r5d, 3
3012    jne .h8_filter_3tap
; strength 3: extend to a 5-tap kernel; also filter the corner sample
3013    punpckhwd            m3, m3
3014    vpblendd             m0, [tlq-34], 0xfe     ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3015    vpblendd             m3, [tlq-26], 0x7f     ; 2 3 4 5 6 7 8 9   a b c d e f g g
3016    movzx               r4d, word [tlq-30]
3017    movzx               r2d, word [tlq-28]
3018    inc            maxbased
3019    paddw                m1, m2
3020    paddw                m0, m3
3021    sub                 r2d, r4d
3022    paddw                m2, m0, m0
3023    lea                 r2d, [r2+r4*8+4]
3024    shr                 r2d, 3
3025    mov            [rsp+30], r2w
3026    jmp .h8_filter_3tap
3027.h8_filter_w4:
3028    pshufhw             xm1, xm0, q2100
3029    vinserti128          m1, [tlq-16], 1        ; _ _ _ _ 4 4 5 6   7 8 9 a b c d e
3030    paddw                m1, m3
3031    pmullw               m1, m4
3032.h8_filter_3tap:
3033    pxor                 m0, m0
3034    paddw                m1, m2
3035    lea                 tlq, [rsp+62]           ; read filtered edge from the stack
3036    psrlw                m1, 3
3037    pavgw                m0, m1                  ; rounding half of the >>3
3038    mova           [rsp+32], m0
; h=8 main loop: interpolate four 8-tall columns per iteration from the
; (possibly filtered) left edge; out-of-range rows use the clamp sample in
; m7, and fully out-of-range columns are flat-filled at the end.
3039.h8_main:
3040    movd                xm4, dyd
3041    neg            maxbaseq
3042    vbroadcasti128       m1, [z_base_inc]
3043    vpbroadcastw         m7, [tlq+maxbaseq*2] ; clamp sample
3044    shl            maxbased, 6
3045    vpbroadcastw         m4, xm4
3046    lea                 r4d, [maxbaseq+7*64]
3047    neg                 dyq
3048    movd                xm2, r4d
3049    sub                 tlq, 16
3050    lea                  r4, [dyq+63]
3051    paddw                m6, m4, m4            ; per-iteration ypos step (2*dy)
3052    vpbroadcastw         m2, xm2
3053    vpblendd             m4, m6, 0xf0 ; ypos0 ypos1
3054    psubw                m2, m1
3055    or             maxbased, 63
3056    paddw                m4, m2
3057.h8_loop:
3058    lea                  r5, [r4+dyq]
3059    sar                  r4, 6 ; base0
3060    movu                xm0, [tlq+r4*2+2]
3061    movu                xm1, [tlq+r4*2]
3062    lea                  r4, [r5+dyq]
3063    sar                  r5, 6 ; base1
3064    vinserti128          m0, [tlq+r5*2+2], 1
3065    vinserti128          m1, [tlq+r5*2], 1
3066    lea                  r5, [r4+dyq]
3067    sar                  r4, 6 ; base2
3068    pand                 m3, m5, m4   ; frac_y
3069    psllw                m3, 9
3070    psubw                m1, m0
3071    pmulhrsw             m1, m3
3072    psraw                m3, m4, 15   ; ypos < max_base_y
3073    paddw                m4, m6
3074    paddw                m0, m1
3075    movu                xm1, [tlq+r4*2+2]
3076    movu                xm2, [tlq+r4*2]
3077    lea                  r4, [r5+dyq]
3078    sar                  r5, 6 ; base3
3079    vpblendvb            m0, m7, m0, m3
3080    vinserti128          m1, [tlq+r5*2+2], 1
3081    vinserti128          m2, [tlq+r5*2], 1
3082    pand                 m3, m5, m4
3083    psllw                m3, 9
3084    psubw                m2, m1
3085    pmulhrsw             m2, m3
3086    psraw                m3, m4, 15
3087    paddw                m4, m6
3088    lea                  r5, [dstq+strideq*4]
3089    paddw                m1, m2
3090    vpblendvb            m1, m7, m1, m3
3091    punpckhwd            m2, m0, m1   ; a3 c3 a2 c2 a1 c1 a0 c0   b3 d3 b2 d2 b1 d1 b0 d0
3092    vextracti128        xm3, m2, 1
3093    punpcklwd            m0, m1       ; a7 c7 a6 c6 a5 c5 a4 c5   b7 d7 b6 d6 b5 d5 b4 d4
3094    punpckhwd           xm1, xm2, xm3 ; a1 b1 c1 d1 a0 b0 c0 d0
3095    punpcklwd           xm2, xm3      ; a3 b3 c3 d3 a2 b2 c2 d2
3096    vextracti128        xm3, m0, 1
3097    movhps [dstq+strideq*0], xm1
3098    movq   [dstq+strideq*1], xm1
3099    movhps [dstq+strideq*2], xm2
3100    movq   [dstq+r7       ], xm2
3101    punpckhwd           xm1, xm0, xm3 ; a5 b5 c5 d5 a4 b4 c4 d4
3102    punpcklwd           xm0, xm3      ; a7 b7 c7 d7 a6 b6 c6 d6
3103    movhps [r5  +strideq*0], xm1
3104    movq   [r5  +strideq*1], xm1
3105    movhps [r5  +strideq*2], xm0
3106    movq   [r5  +r7       ], xm0
3107    sub                  wd, 4
3108    jz .h8_end
3109    add                dstq, 8
3110    cmp                 r4d, maxbased
3111    jg .h8_loop
; remaining columns are entirely past the edge: flat fill with the clamp sample
3112    lea                  r6, [strideq*5]
3113    lea                  r2, [strideq+r7*2] ; stride*7
3114    test                 wd, 4
3115    jz .h8_end_loop
3116    movq   [dstq+strideq*0], xm7
3117    movq   [dstq+strideq*1], xm7
3118    movq   [dstq+strideq*2], xm7
3119    movq   [dstq+r7       ], xm7
3120    movq   [dstq+strideq*4], xm7
3121    movq   [dstq+r6       ], xm7
3122    movq   [dstq+r7*2     ], xm7
3123    movq   [dstq+r2       ], xm7
3124    add                dstq, 8
3125    sub                  wd, 4
3126    jz .h8_end
3127.h8_end_loop:
3128    mova   [dstq+strideq*0], xm7
3129    mova   [dstq+strideq*1], xm7
3130    mova   [dstq+strideq*2], xm7
3131    mova   [dstq+r7       ], xm7
3132    mova   [dstq+strideq*4], xm7
3133    mova   [dstq+r6       ], xm7
3134    mova   [dstq+r7*2     ], xm7
3135    mova   [dstq+r2       ], xm7
3136    add                dstq, 16
3137    sub                  wd, 8
3138    jg .h8_end_loop
3139.h8_end:
3140    RET
3141.h16_no_intra_edge_filter:
3142    and            maxbased, 15
3143    or             maxbased, 16 ; imin(w+15, 31)
3144    jmp .h16_main
ALIGN function_align
; z3 h=16: edge filtering only (no upsampling at this size). Smooths up to
; 16+w left-edge samples into a stack buffer — 3-tap, or 5-tap when
; strength==3 — handling the w=4/8/16+ tails separately, then falls into
; .h16_main (continues past this excerpt).
3146.h16:
3147    ALLOC_STACK         -96, 10
3148    lea            maxbased, [wq+15]
3149    lea                  r7, [strideq*3]
3150    test             angled, 0x400
3151    jnz .h16_no_intra_edge_filter
3152    call .filter_strength
3153    test                r5d, r5d
3154    jz .h16_main ; filter_strength == 0
3155    popcnt              r5d, r5d
3156    movu                 m0, [tlq-28]            ; 3 4 5 6 7 8 9 a   b c d e f g h i
3157    paddw                m1, m0, [tlq-32]        ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3158    vpbroadcastd         m6, [base+z_filter_k-4+r5*4+12*1]
3159    vpbroadcastd         m7, [base+z_filter_k-4+r5*4+12*0]
3160    pmullw               m2, m6, [tlq-30]        ; 2 3 4 5 6 7 8 9   a b c d e f g h
3161    pmullw               m1, m7
3162    paddw                m1, m2
3163    cmp                  wd, 8
3164    jg .h16_filter_w16
3165    mova                xm3, [tlq-46]            ; 0 1 2 3 4 5 6 7
3166    pmullw              xm6, xm3
3167    jl .h16_filter_w4
; w=8 tail: pad above with one replicated sample
3168    pblendw             xm3, [tlq-48], 0xfe      ; 0 0 1 2 3 4 5 6
3169    cmp                 r5d, 3
3170    jne .h16_filter_w8_3tap
3171    vpblendd            xm4, xm3, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
3172.h16_filter_w8_5tap:
3173    punpckhwd            m0, m0
3174    vpblendd             m0, [tlq-26], 0x7f      ; 4 5 6 7 8 9 a b   c d e f g h i i
3175    paddw               xm4, [tlq-42]            ; 2 3 4 5 6 7 8 9
3176    paddw                m0, [tlq-34]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3177    paddw               xm4, xm4
3178    paddw                m0, m0
3179    paddw               xm6, xm4
3180    paddw                m1, m0
3181.h16_filter_w8_3tap:
3182    paddw               xm3, [tlq-44]            ; 1 2 3 4 5 6 7 8
3183    pmullw              xm3, xm7
3184    pxor                 m0, m0
3185    paddw               xm3, xm6
3186    psrlw               xm3, 3
3187    pavgw               xm3, xm0                 ; rounding half of the >>3
3188    mova           [rsp+48], xm3
3189    jmp .h16_filter_end
3190.h16_filter_w4:
; w=4 tail: replicate the topmost sample across the missing positions
3191    pshufhw             xm3, xm3, q2100          ; _ _ _ _ 4 4 5 6
3192    cmp                 r5d, 3
3193    jne .h16_filter_w8_3tap
3194    pshufhw             xm4, xm3, q2100          ; _ _ _ _ 4 4 4 5
3195    jmp .h16_filter_w8_5tap
3196.h16_filter_w16:
3197    mova                 m3, [tlq-62]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3198    pmullw               m6, m3
3199    punpcklwd           xm3, xm3
3200    vpblendd             m4, m3, [tlq-64], 0xfe  ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3201    paddw                m4, [tlq-60]            ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3202    mov                 r4d, 32
3203    cmp                  wd, 16
3204    cmovg          maxbased, r4d                ; w > 16: use the longer edge
3205    movd           [rsp+28], xm3
3206    pmullw               m4, m7
3207    cmp                 r5d, 3
3208    jne .h16_filter_w16_3tap
; strength 3: extend to a 5-tap kernel; also filter the corner sample
3209    punpckhwd            m0, m0
3210    vpblendd             m3, [tlq-66], 0xfe      ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3211    vpblendd             m0, [tlq-26], 0x7f      ; 4 5 6 7 8 9 a b   c d e f g h i i
3212    paddw                m3, [tlq-58]            ; 2 3 4 5 6 7 8 9   a b c d e f g h
3213    paddw                m0, [tlq-34]            ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3214    movzx               r4d, word [tlq-62]
3215    movzx               r2d, word [tlq-60]
3216    or             maxbased, 1
3217    paddw                m3, m3
3218    sub                 r2d, r4d
3219    paddw                m0, m0
3220    lea                 r2d, [r2+r4*8+4]
3221    paddw                m4, m3
3222    shr                 r2d, 3
3223    paddw                m1, m0
3224    mov            [rsp+30], r2w
3225.h16_filter_w16_3tap:
3226    pxor                 m0, m0
3227    paddw                m4, m6
3228    psrlw                m4, 3
3229    pavgw                m4, m0
3230    mova           [rsp+32], m4
3231.h16_filter_end:
3232    psrlw                m1, 3
3233    lea                 tlq, [rsp+94]            ; read filtered edge from the stack
3234    pavgw                m1, m0
3235    mova           [rsp+64], m1
3236.h16_main:
3237    movd                xm8, dyd
3238    neg            maxbaseq
3239    vpbroadcastw         m9, [tlq+maxbaseq*2]
3240    shl            maxbased, 6
3241    vpbroadcastw         m8, xm8
3242    lea                 r4d, [maxbaseq+dyq+15*64]
3243    neg                 dyq
3244    movd                xm7, r4d
3245    sub                 tlq, 32
3246    lea                  r4, [dyq+63]
3247    vpbroadcastw         m7, xm7
3248    or             maxbased, 63
3249    psubw                m7, [z_base_inc]
3250.h16_loop:
3251    lea                  r5, [r4+dyq]
3252    sar                  r4, 6 ; base0
3253    movu                 m0, [tlq+r4*2+2]
3254    movu                 m2, [tlq+r4*2]
3255    lea                  r4, [r5+dyq]
3256    sar                  r5, 6 ; base1
3257    movu                 m1, [tlq+r5*2+2]
3258    movu                 m3, [tlq+r5*2]
3259    lea                  r5, [r4+dyq]
3260    sar                  r4, 6 ; base3
3261    pand                 m6, m5, m7
3262    psllw                m6, 9
3263    psubw                m2, m0
3264    pmulhrsw             m2, m6
3265    psraw                m6, m7, 15
3266    paddw                m7, m8
3267    paddw                m0, m2
3268    movu                 m2, [tlq+r4*2+2]
3269    movu                 m4, [tlq+r4*2]
3270    lea                  r4, [r5+dyq]
3271    sar                  r5, 6 ; base3
3272    vpblendvb            m0, m9, m0, m6
3273    pand                 m6, m5, m7
3274    psllw                m6, 9
3275    psubw                m3, m1
3276    pmulhrsw             m3, m6
3277    psraw                m6, m7, 15
3278    paddw                m7, m8
3279    paddw                m1, m3
3280    vpblendvb            m1, m9, m1, m6
3281    pand                 m6, m5, m7
3282    psllw                m6, 9
3283    psubw                m4, m2
3284    pmulhrsw             m4, m6
3285    psraw                m6, m7, 15
3286    paddw                m7, m8
3287    paddw                m2, m4
3288    movu                 m3, [tlq+r5*2+2]
3289    movu                 m4, [tlq+r5*2]
3290    vpblendvb            m2, m9, m2, m6
3291    pand                 m6, m5, m7
3292    psllw                m6, 9
3293    psubw                m4, m3
3294    pmulhrsw             m4, m6
3295    psraw                m6, m7, 15
3296    paddw                m7, m8
3297    lea                  r5, [dstq+strideq*4]
3298    paddw                m3, m4
3299    vpblendvb            m3, m9, m3, m6
3300    punpckhwd            m4, m0, m1 ; ab bb aa ba a9 b9 a8 b8   a3 b3 a2 b2 a1 b1 a0 b0
3301    punpcklwd            m0, m1     ; af bf ae be ad bd ac bc   a7 b7 a6 b6 a5 b5 a4 b4
3302    punpckhwd            m1, m2, m3 ; cb db ca da c9 d9 c8 d8   c3 d3 c2 d2 c1 d1 c0 d0
3303    punpcklwd            m2, m3     ; cf df ce de cd dd cc dc   c7 d7 c6 d6 c5 d5 c4 d4
3304    punpckhdq            m3, m4, m1 ; a9 b9 c9 d9 a8 b8 c8 d8   a1 b1 c1 d1 a0 b0 c0 d0
3305    vextracti128        xm6, m3, 1
3306    punpckldq            m4, m1     ; ab bb cb db aa ba ca da   a3 b3 c3 d3 a2 b2 c2 d2
3307    punpckhdq            m1, m0, m2 ; ad bd cd dd ac bc cc dc   a5 b5 c5 d5 a4 b4 c4 d4
3308    punpckldq            m0, m2     ; af bf cf df ae be ce de   a7 b7 c7 d7 a6 b6 c6 d6
3309    vextracti128        xm2, m4, 1
3310    movhps [dstq+strideq*0], xm6
3311    movq   [dstq+strideq*1], xm6
3312    vextracti128        xm6, m1, 1
3313    movhps [dstq+strideq*2], xm2
3314    movq   [dstq+r7       ], xm2
3315    vextracti128        xm2, m0, 1
3316    movhps [r5  +strideq*0], xm6
3317    movq   [r5  +strideq*1], xm6
3318    movhps [r5  +strideq*2], xm2
3319    movq   [r5  +r7       ], xm2
3320    lea                  r5, [dstq+strideq*8]
3321    movhps [r5  +strideq*0], xm3
3322    movq   [r5  +strideq*1], xm3
3323    movhps [r5  +strideq*2], xm4
3324    movq   [r5  +r7       ], xm4
3325    lea                  r5, [r5+strideq*4]
3326    movhps [r5  +strideq*0], xm1
3327    movq   [r5  +strideq*1], xm1
3328    movhps [r5  +strideq*2], xm0
3329    movq   [r5  +r7       ], xm0
3330    sub                  wd, 4
3331    jz .h16_end
3332    add                dstq, 8
3333    cmp                 r4d, maxbased
3334    jg .h16_loop
3335    mov                  hd, 4
3336.h16_end_loop0:
3337    mov                 r6d, wd
3338    mov                  r2, dstq
3339    test                 wb, 4
3340    jz .h16_end_loop
3341    movq   [dstq+strideq*0], xm9
3342    movq   [dstq+strideq*1], xm9
3343    movq   [dstq+strideq*2], xm9
3344    movq   [dstq+r7       ], xm9
3345    and                 r6d, 120
3346    jz .h16_end_w4
3347    add                dstq, 8
3348.h16_end_loop:
3349    mova   [dstq+strideq*0], xm9
3350    mova   [dstq+strideq*1], xm9
3351    mova   [dstq+strideq*2], xm9
3352    mova   [dstq+r7       ], xm9
3353    add                dstq, 16
3354    sub                 r6d, 8
3355    jg .h16_end_loop
3356.h16_end_w4:
3357    lea                dstq, [r2+strideq*4]
3358    dec                  hd
3359    jg .h16_end_loop0
3360.h16_end:
3361    RET
3362.h32:
; Wide variant of the horizontal predictor. The edge filter here is fixed:
; out = (2*x1 + 2*x2 + 2*x3 + x0 + x4 + rounding) >> 3, i.e. a [1 2 2 2 1]/8
; smoothing where pavgw (with the pw_3 bias) folds in the half-weight outer
; taps. The main loop emits one 32-pixel column per iteration into a
; stack buffer that is transposed to dst at the end.
3363    ALLOC_STACK        -160, 9
3364    lea            maxbased, [wq+31]
3365    and            maxbased, 31
3366    or             maxbased, 32 ; imin(w+31, 63)
3367    test             angled, 0x400
3368    jnz .h32_main                ; 0x400 = edge filter disabled
3369    vpbroadcastd         m2, [pw_3]
3370    movu                 m0, [tlq-28]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
3371    punpckhwd            m1, m0, m0
3372    vpblendd             m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
3373    paddw                m0, [tlq-30]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
3374    paddw                m1, m2
3375    paddw                m0, [tlq-32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3376    pavgw                m1, [tlq-34]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3377    lea                  r4, [rsp+128]
3378    paddw                m0, m1
3379    lea                 r5d, [maxbaseq-31]
3380    psrlw                m0, 2
3381    mova               [r4], m0
3382.h32_filter_loop:
; Filter the remaining edge 16 pixels at a time, walking tlq and the
; stack destination downwards in lockstep.
3383    mova                 m0, [tlq-62]
3384    paddw                m1, m2, [tlq-66]
3385    paddw                m0, [tlq-64]
3386    pavgw                m1, [tlq-58]
3387    paddw                m0, [tlq-60]
3388    sub                 tlq, 32
3389    sub                  r4, 32
3390    paddw                m0, m1
3391    psrlw                m0, 2
3392    mova               [r4], m0
3393    sub                 r5d, 16
3394    jg .h32_filter_loop
3395    jl .h32_filter_h8               ; fewer than 16 pixels left
3396    mova                 m0, [tlq-62]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3397    punpcklwd           xm1, xm0, xm0
3398    paddw                m2, [tlq-58]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
3399    paddw                m0, [tlq-60]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3400    vpblendd             m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3401    vpblendd             m1, [tlq-64], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3402    movzx               r5d, word [tlq-62]
3403    movzx               r2d, word [tlq-60]
3404    pavgw                m2, m3
3405    sub                 r2d, r5d
3406    paddw                m0, m1
3407    lea                 r2d, [r2+r5*8+4]        ; extrapolate one word past the edge
3408    paddw                m0, m2
3409    shr                 r2d, 3
3410    psrlw                m0, 2
3411    mova            [r4-32], m0
3412    mov             [r4-36], r5w
3413    mov             [r4-34], r2w
3414    lea                 tlq, [rsp+158]           ; switch edge pointer to the filtered stack copy
3415    mov                 r4d, 65
3416    cmp                  wd, 64
3417    cmove          maxbased, r4d
3418    jmp .h32_main
3419.h32_filter_h8:
3420    mova                xm0, [tlq-46]            ; 0 1 2 3 4 5 6 7
3421    pblendw             xm1, xm0, [tlq-48], 0xfe ; 0 0 1 2 3 4 5 6
3422    paddw               xm2, [tlq-42]            ; 2 3 4 5 6 7 8 9
3423    paddw               xm0, [tlq-44]            ; 1 2 3 4 5 6 7 8
3424    vpblendd            xm3, xm1, [tlq-50], 0x0e ; 0 0 0 1 2 3 4 5
3425    lea                 tlq, [rsp+158]
3426    pavgw               xm2, xm3
3427    paddw               xm0, xm1
3428    paddw               xm0, xm2
3429    psrlw               xm0, 2
3430    mova            [r4-16], xm0
3431.h32_main:
; 6.6 fixed-point walk: m6 = dy step, m7 = clamp pixel, m4 = lane positions,
; pw_m1024 (m8) thresholds positions for the upper 16 pixels of the column.
3432    movd                xm6, dyd
3433    neg            maxbaseq
3434    vpbroadcastw         m7, [tlq+maxbaseq*2]
3435    shl            maxbased, 6
3436    vpbroadcastw         m6, xm6
3437    lea                 r4d, [maxbaseq+dyq+15*64]
3438    neg                 dyq
3439    movd                xm4, r4d
3440    vpbroadcastd         m8, [pw_m1024]
3441    lea                  r4, [dyq+63]
3442    vpbroadcastw         m4, xm4
3443    or             maxbased, 63
3444    psubw                m4, [z_base_inc]
3445.h32_loop:
; One 32-pixel column per iteration, stored to a grow-down stack buffer.
3446    mov                  r5, r4
3447    sar                  r5, 6
3448    movu                 m1, [tlq+r5*2-64]
3449    movu                 m0, [tlq+r5*2-62]
3450    pand                 m3, m5, m4
3451    psllw                m3, 9
3452    psubw                m1, m0
3453    pmulhrsw             m1, m3
3454    pcmpgtw              m2, m8, m4
3455    paddw                m0, m1
3456    vpblendvb            m0, m7, m0, m2
3457    movu                 m2, [tlq+r5*2-32]
3458    movu                 m1, [tlq+r5*2-30]
3459    add                  r4, dyq
3460    sub                 rsp, 64
3461    psubw                m2, m1
3462    pmulhrsw             m2, m3
3463    psraw                m3, m4, 15
3464    paddw                m4, m6
3465    mova         [rsp+32*0], m0
3466    paddw                m1, m2
3467    vpblendvb            m1, m7, m1, m3
3468    mova         [rsp+32*1], m1
3469    dec                  wd
3470    jz .h32_transpose
3471    cmp                 r4d, maxbased
3472    jg .h32_loop
3473.h32_end_loop:
; Remaining columns are entirely past the edge: store the clamp pixel.
3474    sub                 rsp, 64
3475    mova         [rsp+32*0], m7
3476    mova         [rsp+32*1], m7
3477    dec                  wd
3478    jg .h32_end_loop
3479.h32_transpose:
; Transpose the stacked columns 8 at a time and write 8-wide rows to dst.
3480    lea                  r3, [strideq*3]
3481    lea                  r4, [strideq*5]
3482    mov                  r8, dstq
3483    lea                  r5, [strideq+r3*2]      ; strideq*7
3484.h32_transpose_loop0:
3485    lea                  r6, [rsp+32]
3486    lea                  r2, [r8+org_wq*2-16]
3487.h32_transpose_loop:
3488    mova                 m0, [r6+64*7]
3489    mova                 m1, [r6+64*6]
3490    mova                 m2, [r6+64*5]
3491    mova                 m3, [r6+64*4]
3492    mova                 m4, [r6+64*3]
3493    mova                 m5, [r6+64*2]
3494    mova                 m6, [r6+64*1]
3495    mova                 m7, [r6+64*0]
3496    punpckhwd            m8, m0, m1 ; a3 b3 a2 b2 a1 b1 a0 b0
3497    punpcklwd            m0, m1     ; a7 b7 a6 b6 a5 b5 a4 b4
3498    punpckhwd            m1, m2, m3 ; c3 d3 c2 d2 c1 d1 c0 d0
3499    punpcklwd            m2, m3     ; c7 d7 c6 d6 c5 d5 c4 d4
3500    punpckhwd            m3, m4, m5 ; e3 f3 e2 f2 e1 f1 e0 f0
3501    punpcklwd            m4, m5     ; e7 f7 e6 f6 e5 f5 e4 f4
3502    punpckhwd            m5, m6, m7 ; g3 h3 g2 h2 g1 h1 g0 h0
3503    punpcklwd            m6, m7     ; g7 h7 g6 h6 g5 h5 g4 h4
3504    lea                dstq, [r2+strideq*8]
3505    sub                  r6, 32
3506    punpckhdq            m7, m8, m1 ; a1 b1 c1 d1 a0 b0 c0 d0
3507    punpckldq            m8, m1     ; a3 b3 c3 d3 a2 b2 c2 d2
3508    punpckhdq            m1, m3, m5 ; e1 f1 g1 h1 e0 f0 g0 h0
3509    punpckldq            m3, m5     ; e3 f3 g3 h3 e2 f2 g2 h2
3510    punpckhqdq           m5, m7, m1 ;  8  0
3511    vextracti128 [r2  +strideq*0], m5, 1
3512    punpcklqdq           m7, m1     ;  9  1
3513    mova         [dstq+strideq*0], xm5
3514    punpckhqdq           m1, m8, m3 ; 10  2
3515    vextracti128 [r2  +strideq*1], m7, 1
3516    punpcklqdq           m8, m3     ; 11  3
3517    mova         [dstq+strideq*1], xm7
3518    punpckhdq            m3, m0, m2 ; a5 b5 c5 d5 a4 b4 c4 d4
3519    vextracti128 [r2  +strideq*2], m1, 1
3520    punpckldq            m0, m2     ; a7 b7 c7 d7 a6 b6 c6 d6
3521    mova         [dstq+strideq*2], xm1
3522    punpckhdq            m2, m4, m6 ; e5 f5 g5 h5 e4 f4 g4 h4
3523    vextracti128 [r2  +r3       ], m8, 1
3524    punpckldq            m4, m6     ; e7 f7 g7 h7 e6 f6 g6 h6
3525    mova         [dstq+r3       ], xm8
3526    punpckhqdq           m6, m3, m2 ; 12  4
3527    vextracti128 [r2  +strideq*4], m6, 1
3528    punpcklqdq           m3, m2     ; 13  5
3529    mova         [dstq+strideq*4], xm6
3530    punpckhqdq           m2, m0, m4 ; 14  6
3531    vextracti128 [r2  +r4       ], m3, 1
3532    punpcklqdq           m0, m4     ; 15  7
3533    mova         [dstq+r4       ], xm3
3534    vextracti128 [r2  +r3*2     ], m2, 1
3535    mova         [dstq+r3*2     ], xm2
3536    vextracti128 [r2  +r5       ], m0, 1
3537    mova         [dstq+r5       ], xm0
3538    lea                  r2, [dstq+strideq*8]
3539    cmp                  r6, rsp
3540    jae .h32_transpose_loop
3541    add                 rsp, 64*8                ; release this 8-column batch
3542    sub              org_wd, 8
3543    jg .h32_transpose_loop0
3544.h32_end:
3545    RET
3546.h64:
; Widest variant: same [1 2 2 2 1]/8 fixed edge smoothing as .h32, then
; one 64-pixel column per loop iteration (four 16-word segments, each with
; its own past-the-edge threshold in m7/m8/m9), buffered on the stack and
; transposed to dst in 16-wide row batches at the end.
3547    ALLOC_STACK        -256, 10
3548    lea            maxbased, [wq+63]
3549    test             angled, 0x400
3550    jnz .h64_main                ; 0x400 = edge filter disabled
3551    vpbroadcastd         m2, [pw_3]
3552    movu                 m0, [tlq-28]       ; 3 4 5 6 7 8 9 a   b c d e f g h i
3553    punpckhwd            m1, m0, m0
3554    vpblendd             m1, [tlq-26], 0x7f ; 4 5 6 7 8 9 a b   c d e f g h i i
3555    paddw                m0, [tlq-30]       ; 2 3 4 5 6 7 8 9   a b c d e f g h
3556    paddw                m1, m2
3557    paddw                m0, [tlq-32]       ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3558    pavgw                m1, [tlq-34]       ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3559    lea                  r4, [rsp+224]
3560    paddw                m0, m1
3561    lea                 r5d, [wq+32]
3562    psrlw                m0, 2
3563    mova               [r4], m0
3564.h64_filter_loop:
3565    mova                 m0, [tlq-62]
3566    paddw                m1, m2, [tlq-66]
3567    paddw                m0, [tlq-64]
3568    pavgw                m1, [tlq-58]
3569    paddw                m0, [tlq-60]
3570    sub                 tlq, 32
3571    sub                  r4, 32
3572    paddw                m0, m1
3573    psrlw                m0, 2
3574    mova               [r4], m0
3575    sub                 r5d, 16
3576    jg .h64_filter_loop
3577    mova                 m0, [tlq-62]           ; 0 1 2 3 4 5 6 7   8 9 a b c d e f
3578    punpcklwd           xm1, xm0, xm0
3579    paddw                m2, [tlq-58]           ; 2 3 4 5 6 7 8 9   a b c d e f g h
3580    paddw                m0, [tlq-60]           ; 1 2 3 4 5 6 7 8   9 a b c d e f g
3581    vpblendd             m3, m1, [tlq-66], 0xfe ; 0 0 0 1 2 3 4 5   6 7 8 9 a b c d
3582    vpblendd             m1, [tlq-64], 0xfe     ; 0 0 1 2 3 4 5 6   7 8 9 a b c d e
3583    lea                 tlq, [rsp+254]           ; switch edge pointer to the filtered stack copy
3584    pavgw                m2, m3
3585    paddw                m0, m1
3586    paddw                m0, m2
3587    psrlw                m0, 2
3588    mova            [r4-32], m0
3589.h64_main:
; m4 = dy step, m6 = clamp pixel, m3 = lane positions; m7/m8/m9 hold
; -1024, -2048, -3072 thresholds for the three upper column segments.
3590    neg            maxbaseq
3591    movd                xm4, dyd
3592    vpbroadcastw         m6, [tlq+maxbaseq*2]
3593    shl            maxbased, 6
3594    vpbroadcastw         m4, xm4
3595    lea                 r4d, [maxbaseq+dyq+15*64]
3596    neg                 dyq
3597    vpbroadcastd         m7, [pw_m1024]
3598    movd                xm3, r4d
3599    lea                  r4, [dyq+63]
3600    paddw                m8, m7, m7
3601    vpbroadcastw         m3, xm3
3602    or             maxbased, 63
3603    paddw                m9, m8, m7
3604    psubw                m3, [z_base_inc]
3605.h64_loop:
3606    mov                  r5, r4
3607    sar                  r5, 6
3608    movu                 m1, [tlq+r5*2-128]
3609    movu                 m0, [tlq+r5*2-126]
3610    pand                 m2, m5, m3
3611    psllw                m2, 9
3612    psubw                m1, m0
3613    pmulhrsw             m1, m2
3614    sub                 rsp, 128
3615    paddw                m0, m1
3616    pcmpgtw              m1, m9, m3
3617    vpblendvb            m0, m6, m0, m1
3618    mova         [rsp+32*0], m0
3619    movu                 m1, [tlq+r5*2-96]
3620    movu                 m0, [tlq+r5*2-94]
3621    psubw                m1, m0
3622    pmulhrsw             m1, m2
3623    paddw                m0, m1
3624    pcmpgtw              m1, m8, m3
3625    vpblendvb            m0, m6, m0, m1
3626    mova         [rsp+32*1], m0
3627    movu                 m1, [tlq+r5*2-64]
3628    movu                 m0, [tlq+r5*2-62]
3629    psubw                m1, m0
3630    pmulhrsw             m1, m2
3631    paddw                m0, m1
3632    pcmpgtw              m1, m7, m3
3633    vpblendvb            m0, m6, m0, m1
3634    mova         [rsp+32*2], m0
3635    movu                 m1, [tlq+r5*2-32]
3636    movu                 m0, [tlq+r5*2-30]
3637    psubw                m1, m0
3638    pmulhrsw             m1, m2
3639    add                  r4, dyq
3640    psraw                m2, m3, 15               ; sign bit = position < 0 for the last segment
3641    paddw                m3, m4
3642    paddw                m0, m1
3643    vpblendvb            m0, m6, m0, m2
3644    mova         [rsp+32*3], m0
3645    dec                  wd
3646    jz .h64_transpose
3647    cmp                 r4d, maxbased
3648    jg .h64_loop
3649.h64_end_loop:
; Remaining columns are entirely past the edge: store the clamp pixel.
3650    sub                 rsp, 128
3651    mova         [rsp+32*0], m6
3652    mova         [rsp+32*1], m6
3653    mova         [rsp+32*2], m6
3654    mova         [rsp+32*3], m6
3655    dec                  wd
3656    jg .h64_end_loop
3657.h64_transpose:
3658    lea                  r2, [strideq*3]
3659    lea                  r3, [strideq*5]
3660    mov                  r5, dstq
3661    lea                  r4, [strideq+r2*2]       ; strideq*7
3662.h64_transpose_loop0:
3663    lea                  r6, [rsp+112]
3664    lea                dstq, [r5+org_wq*2-32]
3665.h64_transpose_loop:
; Gather 8 column slices (two 128-byte-strided halves per register) and
; transpose to eight 16-wide rows.
3666    mova                xm0, [r6+128*15]
3667    vinserti128          m0, [r6+128* 7], 1
3668    mova                xm1, [r6+128*14]
3669    vinserti128          m1, [r6+128* 6], 1
3670    mova                xm2, [r6+128*13]
3671    vinserti128          m2, [r6+128* 5], 1
3672    mova                xm3, [r6+128*12]
3673    vinserti128          m3, [r6+128* 4], 1
3674    mova                xm4, [r6+128*11]
3675    vinserti128          m4, [r6+128* 3], 1
3676    mova                xm5, [r6+128*10]
3677    vinserti128          m5, [r6+128* 2], 1
3678    mova                xm6, [r6+128* 9]
3679    vinserti128          m6, [r6+128* 1], 1
3680    mova                xm7, [r6+128* 8]
3681    vinserti128          m7, [r6+128* 0], 1
3682    punpckhwd            m8, m0, m1
3683    punpcklwd            m0, m1
3684    punpckhwd            m1, m2, m3
3685    punpcklwd            m2, m3
3686    punpckhwd            m3, m4, m5
3687    punpcklwd            m4, m5
3688    punpckhwd            m5, m6, m7
3689    punpcklwd            m6, m7
3690    sub                  r6, 16
3691    punpckhdq            m7, m8, m1
3692    punpckldq            m8, m1
3693    punpckhdq            m1, m3, m5
3694    punpckldq            m3, m5
3695    punpckhqdq           m5, m7, m1
3696    punpcklqdq           m7, m1
3697    punpckhqdq           m1, m8, m3
3698    punpcklqdq           m8, m3
3699    punpckhdq            m3, m0, m2
3700    mova   [dstq+strideq*0], m5
3701    punpckldq            m0, m2
3702    mova   [dstq+strideq*1], m7
3703    punpckhdq            m2, m4, m6
3704    mova   [dstq+strideq*2], m1
3705    punpckldq            m4, m6
3706    mova   [dstq+r2       ], m8
3707    punpckhqdq           m6, m3, m2
3708    mova   [dstq+strideq*4], m6
3709    punpcklqdq           m3, m2
3710    mova   [dstq+r3       ], m3
3711    punpckhqdq           m2, m0, m4
3712    mova   [dstq+r2*2     ], m2
3713    punpcklqdq           m0, m4
3714    mova   [dstq+r4       ], m0
3715    lea                dstq, [dstq+strideq*8]
3716    cmp                  r6, rsp
3717    jae .h64_transpose_loop
3718    add                 rsp, 128*16              ; release this 16-column batch
3719    sub              org_wd, 16
3720    jg .h64_transpose_loop0
3721.h64_end:
3722    RET
3723
3724%macro FILTER_1BLK 5 ; dst, src, tmp, shuf, bdmax
; Filter one 4x2 block: shuffle the 7 source pixels into tap order (%4),
; mirror them into both 128-bit lanes, then accumulate the four tap pairs
; (broadcast via pshufd q0000..q3333) against the filter coefficients in
; m2-m5 with pmaddwd. m1 holds the rounding bias (pd_8, loaded in the
; function prologue); the dword sums are >>4, packed to unsigned words and
; clamped to bitdepth_max (%5).
3725%ifnum %4
3726    pshufb             xm%2, xm%4
3727%else
3728    pshufb             xm%2, %4                 ; shuffle mask given as a memory operand
3729%endif
3730    vinserti128         m%2, xm%2, 1             ; duplicate the low lane into the high lane
3731    pshufd              m%1, m%2, q0000
3732    pmaddwd             m%1, m2
3733    pshufd              m%3, m%2, q1111
3734    pmaddwd             m%3, m3
3735    paddd               m%1, m1                  ; + rounding (pd_8)
3736    paddd               m%1, m%3
3737    pshufd              m%3, m%2, q2222
3738    pmaddwd             m%3, m4
3739    paddd               m%1, m%3
3740    pshufd              m%3, m%2, q3333
3741    pmaddwd             m%3, m5
3742    paddd               m%1, m%3
3743    psrad               m%1, 4
3744    packusdw            m%1, m%1
3745    pminsw              m%1, m%5                 ; clamp to bitdepth_max
3746%endmacro
3747
3748%macro FILTER_2BLK 7 ; dst, src, tmp_dst, tmp_src, tmp, shuf, bdmax
; Filter two 4x2 blocks at once: after the shuffle, the second block's
; pixels are moved to %4 (vpermq q3232) while the first block is mirrored
; across lanes as in FILTER_1BLK. Both accumulators (%1 and %3) run the
; same four pmaddwd tap passes against m2-m5 with the pd_8 bias in m1,
; then the two results are packed together and clamped to bitdepth_max (%7).
3749    pshufb              m%2, m%6
3750    vpermq              m%4, m%2, q3232          ; second block's source words
3751    vinserti128         m%2, xm%2, 1             ; first block mirrored into both lanes
3752    pshufd              m%1, m%2, q0000
3753    pshufd              m%3, m%4, q0000
3754    pmaddwd             m%1, m2
3755    pmaddwd             m%3, m2
3756    paddd               m%1, m1                  ; + rounding (pd_8)
3757    paddd               m%3, m1
3758    pshufd              m%5, m%2, q1111
3759    pmaddwd             m%5, m3
3760    paddd               m%1, m%5
3761    pshufd              m%5, m%4, q1111
3762    pmaddwd             m%5, m3
3763    paddd               m%3, m%5
3764    pshufd              m%5, m%2, q2222
3765    pmaddwd             m%5, m4
3766    paddd               m%1, m%5
3767    pshufd              m%5, m%4, q2222
3768    pmaddwd             m%5, m4
3769    paddd               m%3, m%5
3770    pshufd              m%5, m%2, q3333
3771    pmaddwd             m%5, m5
3772    paddd               m%1, m%5
3773    pshufd              m%5, m%4, q3333
3774    pmaddwd             m%5, m5
3775    paddd               m%3, m%5
3776    psrad               m%1, 4
3777    psrad               m%3, 4
3778    packusdw            m%1, m%3                 ; both blocks in one register
3779    pminsw              m%1, m%7                 ; clamp to bitdepth_max
3780%endmacro
3781
3782; The ipred_filter SIMD processes 4x2 blocks in the following order which
3783; increases parallelism compared to doing things row by row. One redundant
3784; block is calculated for w8 and w16, two for w32.
3785;     w4     w8       w16             w32
3786;     1     1 2     1 2 3 5     1 2 3 5 b c d f
3787;     2     2 3     2 4 5 7     2 4 5 7 c e f h
3788;     3     3 4     4 6 7 9     4 6 7 9 e g h j
3789; ___ 4 ___ 4 5 ___ 6 8 9 a ___ 6 8 9 a g i j k ___
3790;           5       8           8       i
3791
; ipred_filter_16bpc(dst, stride, tl, w, h, filter_idx, ..., bitdepth_max)
; Recursive filter intra prediction; see the block-ordering diagram above.
; m2-m5 = the four 16-coef tap sets for the chosen filter, m1 = pd_8
; rounding bias -- both consumed by FILTER_1BLK/FILTER_2BLK.
; NOTE(review): this function continues past the end of this chunk
; (.w32 is cut off below); .main is also defined outside this view.
3792cglobal ipred_filter_16bpc, 3, 9, 0, dst, stride, tl, w, h, filter
3793%define base r6-ipred_filter_16bpc_avx2_table
3794    lea                  r6, [filter_intra_taps]
3795    tzcnt                wd, wm                   ; log2(w) indexes the jump table
3796%ifidn filterd, filterm
3797    movzx           filterd, filterb
3798%else
3799    movzx           filterd, byte filterm
3800%endif
3801    shl             filterd, 6                   ; 64 bytes of taps per filter
3802    add             filterq, r6
3803    lea                  r6, [ipred_filter_16bpc_avx2_table]
3804    vbroadcasti128       m0, [tlq-6]
3805    movsxd               wq, [r6+wq*4]
3806    vpbroadcastd         m1, [base+pd_8]
3807    pmovsxbw             m2, [filterq+16*0]
3808    pmovsxbw             m3, [filterq+16*1]
3809    pmovsxbw             m4, [filterq+16*2]
3810    pmovsxbw             m5, [filterq+16*3]
3811    add                  wq, r6
3812    mov                  hd, hm
3813    jmp                  wq
3814.w4:
3815    WIN64_SPILL_XMM      10
3816    mova                xm8, [base+filter_shuf2]
3817    vpbroadcastw         m9, r8m ; bitdepth_max
3818    lea                  r7, [6+hq*2]
3819    sub                 tlq, r7                   ; tlq now addresses the bottom of the left column
3820    jmp .w4_loop_start
3821.w4_loop:
3822    pinsrq              xm0, [tlq+hq*2], 0        ; feed next two left pixels into the source row
3823    lea                dstq, [dstq+strideq*2]
3824.w4_loop_start:
3825    FILTER_1BLK           6, 0, 7, 8, 9
3826    vextracti128        xm0, m6, 1
3827    movq   [dstq+strideq*0], xm6
3828    movq   [dstq+strideq*1], xm0
3829    sub                  hd, 2
3830    jg .w4_loop
3831    RET
3832ALIGN function_align
3833.w8:
3834    WIN64_SPILL_XMM      16
3835    vbroadcasti128      m14, [base+filter_shuf3]
3836    vpbroadcastw        m15, r8m ; bitdepth_max
3837    FILTER_1BLK          10, 0, 7, [base+filter_shuf2], 15
3838    vpermq               m6, m10, q1302         ; ____ ____ | ____ 4321
3839    pslldq               m8, m0, 4
3840    psrldq               m7, m6, 2
3841    psrldq               m0, m6, 10
3842    punpcklwd            m7, m0
3843    vpblendd             m8, m6, 0x33           ; _0__ 4321 | ____ 4321
3844    vpblendd             m8, m7, 0x40           ; _056 4321 | ____ 4321
3845    vpblendd             m8, [tlq-6], 0x30      ; _056 4321 | ____ 4321
3846    lea                  r7, [16+hq*2]
3847    sub                 tlq, r7
3848    jmp .w8_loop_start
3849.w8_loop:
; Rebuild the source row for the next 4x2 pair from the previous output
; (m9, produced by .main) plus fresh left-column pixels.
3850    vpermq               m8, m9, q1302          ; ____ 4321 | ____ 4321
3851    vpermq               m6, m9, q2031
3852    psrldq               m0, m6, 2
3853    psrldq               m6, 10
3854    punpcklwd            m6, m0
3855    vpblendd             m8, m7, 0x80           ; _0__ 4321 | ____ 4321
3856    vpblendd             m8, m6, 0x40           ; _056 4321 | ____ 4321
3857    mova                m10, m9
3858.w8_loop_start:
3859    vpblendd             m8, [tlq+hq*2], 0x0C   ; _056 4321 | _056 4321
3860    call .main                                   ; defined outside this chunk
3861    vpblendd            m10, m9, 0xCC
3862    mova         [dstq+strideq*0], xm10
3863    vextracti128 [dstq+strideq*1], m10, 1
3864    lea                dstq, [dstq+strideq*2]
3865    sub                  hd, 2
3866    jg .w8_loop
3867    RET
3868ALIGN function_align
3869.w16:
3870    ALLOC_STACK          32, 16
3871    vpbroadcastw        m15, r8m ; bitdepth_max
3872    sub                  hd, 2
3873    TAIL_CALL .w16_main, 0
3874.w16_main:
; Shared by .w16 and .w32 (the latter calls it for its left half).
3875    mova               xm10, [base+filter_shuf2]
3876    FILTER_1BLK          13, 0, 6, 10, 15
3877    vpermq              m12, m13, q3120
3878    mova               xm14, [base+filter_shuf3]
3879    vinserti128         m14, [base+filter_shuf1], 1
3880    vpbroadcastq         m0, [tlq+10]
3881    vpblendd             m0, [tlq-16], 0x4C     ; ___0 4321 | _056 ____
3882    psrldq               m6, m12, 8
3883    vpblendd             m0, m6, 0x03           ; ___0 4321 | _056 4321
3884    punpcklwd            m6, m12
3885    vpblendd             m0, m6, 0x80           ; 56_0 4321 | _056 4321
3886    FILTER_2BLK          12, 0, 6, 7, 8, 14, 15
3887    vpblendd            m13, m12, 0xCC
3888    vpermq              m12, m12, q2031         ; 6___ 5___
3889    psrldq              xm6, xm12, 2
3890    psrldq              xm8, xm12, 12
3891    vpblendd            xm6, xm8, 0x01
3892    pblendw             xm6, [tlq+10], 0xF8     ; 4321 056_
3893    FILTER_1BLK          11, 6, 8, 10, 15
3894    vpermq              m11, m11, q3120
3895    pshufd               m9, m11, q1032
3896    movu                 m8, [tlq+6]            ; __43 210_ | ____ ____
3897    pshufd               m8, m8, q3021          ; __0_ 4321 | ____ ____
3898    pshufhw              m8, m8, q3201          ; ___0 4321 | ____ ____
3899    vpblendd             m9, m8, 0x70           ; ___0 4321 | ____ 4321
3900    mova         [dstq+strideq*0], xm13
3901    vextracti128 [dstq+strideq*1], m13, 1
3902    lea                  r7, [20+hq*2]
3903    sub                 tlq, r7
3904    vpermq               m6, m12, q0123         ; ____ 4321 | ____ 4321
3905    jmp .w16_loop_start
3906.w16_loop:
3907    vpermq              m13, m13, q3322
3908    vpermq              m11,  m9, q2020
3909    vpermq               m9,  m9, q1302
3910    vpermq               m6, m12, q0123
3911    psrldq               m7, 4
3912    vpblendd            m13, m10, 0xCC
3913    vpblendd             m9, m7, 0x40
3914    mova                 m0, [rsp+8]
3915    mova         [dstq+strideq*0], xm13
3916    vextracti128 [dstq+strideq*1], m13, 1
3917.w16_loop_start:
3918    mova                m13, m12
3919    vpblendd             m0, [tlq+hq*2], 0x0C
3920    psrldq               m7, m12, 8
3921    punpcklwd            m7, m12
3922    vpblendd             m0, m6, 0x33           ; ___0 4321 | _056 4321
3923    vpblendd             m0, m7, 0x80           ; 56_0 4321 | _056 4321
3924    FILTER_2BLK          10, 0, 6, 7, 8, 14, 15
3925    vpermq              m12, m10, q2031
3926    mova            [rsp+8], m0
3927    psrldq               m8, m11, 8
3928    psrldq              xm6, xm12, 2
3929    psrldq              xm7, xm12, 10
3930    psrldq              xm0, xm13, 2
3931    punpcklwd            m8, m11
3932    punpcklwd           xm7, xm6
3933    vpblendd             m8, m9, 0x73           ; 56_0 4321 | ____ 4321
3934    vpblendd             m8, m7, 0x04           ; 56_0 4321 | __56 4321
3935    vpblendd             m8, m0, 0x08           ; 56_0 4321 | _056 4321
3936    call .main                                   ; defined outside this chunk
3937    vpermq               m8, m11, q3120
3938    vpblendd             m6, m8, m9, 0xCC
3939    mova         [dstq+strideq*0+16], xm6
3940    vextracti128 [dstq+strideq*1+16], m6, 1
3941    lea                dstq, [dstq+strideq*2]
3942    sub                  hd, 2
3943    jg .w16_loop
; Final (redundant-block) row pair; uses ret so .w32 can call .w16_main.
3944    vpermq               m8, m9, q3120
3945    vextracti128        xm0, m8, 1              ; 4321 ____
3946    pshufd             xm11, xm11, q1032
3947    vpblendd            xm0, xm11, 0x02         ; 4321 0___
3948    psrldq              xm6, xm8, 2
3949    psrldq              xm7, xm8, 12
3950    pblendw             xm0, xm6, 0x4           ; 4321 05__
3951    pblendw             xm0, xm7, 0x2           ; 4321 056_
3952    FILTER_1BLK           6, 0, 7, [base+filter_shuf2], 15
3953    vpermq              m12, m13, q1302
3954    vpblendd            m12, m10, 0xCC
3955    vpblendd             m9, m6, 0xCC
3956    mova         [dstq+strideq*0+ 0], xm12
3957    mova         [dstq+strideq*0+16], xm9
3958    vextracti128 [dstq+strideq*1+ 0], m12, 1
3959    vextracti128 [dstq+strideq*1+16], m9, 1
3960    ret
3961ALIGN function_align
3962.w32:
; Left 16 columns via .w16_main, then the right half below (continues past
; the end of this chunk).
3963    ALLOC_STACK          64, 16
3964    vpbroadcastw        m15, r8m ; bitdepth_max
3965    sub                  hd, 2
3966    lea                  r3, [dstq+32]
3967    lea                 r5d, [hd*2+20]
3968    call .w16_main
3969    mov                dstq, r3
3970    lea                 tlq, [tlq+r5+32]
3971    sub                 r5d, 20
3972    shr                 r5d, 1
3973    sub                 r5d, 2
3974    lea                  r4, [dstq+strideq*2-2]
3975DEFINE_ARGS dst, stride, tl, stride3, left, h
3976    lea            stride3q, [strideq*3]
3977    movu                 m8, [tlq-6]                        ; 4321 0___
3978    mova               xm10, [base+filter_shuf2]
3979    pinsrw              xm0, xm8, [dstq+strideq*0-2], 2
3980    pinsrw              xm0, xm0, [dstq+strideq*1-2], 1     ; 4321 056_
3981    pinsrw              xm9, [leftq+strideq*0], 5
3982    pinsrw              xm9, [leftq+strideq*1], 4
3983    FILTER_1BLK          13, 0, 6, 10, 15
3984    vpermq              m12, m13, q3120
3985    mova               xm14, [base+filter_shuf3]
3986    vinserti128         m14, [base+filter_shuf1], 1
3987    psrldq               m6, m12, 8
3988    punpcklwd            m7, m6, m12
3989    vpblendd             m0, m6, 0x03           ; ___0 ____ | _0__ 4321
3990    vpblendd             m0, m7, 0x80           ; 56_0 ____ | _0__ 4321
3991    vpblendd             m0, m8, 0x30           ; 56_0 4321 | _0__ 4321
3992    vpblendd             m0, m9, 0x04           ; 56_0 4321 | _056 4321
3993    FILTER_2BLK          12, 0, 6, 7, 8, 14, 15
3994    vpblendd            m13, m12, 0xCC
3995    pinsrw              xm9, [leftq+strideq*2], 3
3996    pinsrw              xm9, [leftq+stride3q ], 2
3997    lea               leftq, [leftq+strideq*4]
3998    pinsrw              xm9, [leftq+strideq*0], 1
3999    pinsrw              xm9, [leftq+strideq*1], 0
4000    movq           [rsp+32], xm9
4001    mov                 r7d, 1
4002    pslldq               m8, m9, 4
4003    vpblendd             m0, m8, 0x0C           ; ___0 ____ | _056 ____
4004    vpermq              m12, m12, q2031         ; 6___ 5___
4005    psrldq              xm6, xm12, 2
4006    psrldq              xm7, xm12, 12
4007    vpblendd            xm6, xm7, 0x01          ; ____ _56_
4008    pblendw             xm6, [tlq+10], 0xF8     ; 4321 056_
4009    FILTER_1BLK          11, 6, 7, 10, 15
4010    vpermq              m11, m11, q3120
4011    pshufd               m9, m11, q1032
4012    vbroadcasti128       m8, [tlq+22]           ; __43 210_ | ____ ____
4013    pshufd               m8, m8, q3021          ; __0_ 4321 | ____ ____
4014    pshufhw              m8, m8, q3201          ; ___0 4321 | ____ ____
4015    vpblendd             m9, m8, 0x70           ; ___0 4321 | ____ 4321
4016    mova         [dstq+strideq*0], xm13
4017    vextracti128 [dstq+strideq*1], m13, 1
4018    vpermq               m6, m12, q0123         ; ____ 4321 | ____ 4321
4019    jmp .w32_loop_start
4020.w32_loop_last:
4021    mova                 m0, [rsp+0]
4022    jmp .w32_loop
4023.w32_loop_left:
4024    mova                 m0, [rsp+0]
4025    vpblendd             m0, [rsp+32+r7*4-12], 0x0C
4026    dec                 r7d
4027    jg .w32_loop
4028    cmp                  hd, 2
4029    je .w32_loop
4030    pinsrw              xm6, [rsp+32], 6
4031    pinsrw              xm6, [leftq+strideq*2], 5
4032    pinsrw              xm6, [leftq+stride3q ], 4
4033    lea               leftq, [leftq+strideq*4]
4034    pinsrw              xm6, [leftq+strideq*0], 3
4035    pinsrw              xm6, [leftq+strideq*1], 2
4036    pinsrw              xm6, [leftq+strideq*2], 1
4037    pinsrw              xm6, [leftq+stride3q ], 0
4038    lea               leftq, [leftq+strideq*4]
4039    movu           [rsp+36], xm6
4040    pinsrw              xm6, [leftq+strideq*0], 1
4041    pinsrw              xm6, [leftq+strideq*1], 0
4042    movd           [rsp+32], xm6
4043    mov                 r7d, 4
4044.w32_loop:
4045    vpermq              m13, m13, q3322
4046    vpermq              m11,  m9, q2020
4047    vpermq               m9,  m9, q1302
4048    vpermq               m6, m12, q0123
4049    psrldq               m7, 4
4050    vpblendd            m13, m10, 0xCC
4051    vpblendd             m9, m7, 0x40           ; ___0 4321 | ____ 4321
4052    mova         [dstq+strideq*0], xm13
4053    vextracti128 [dstq+strideq*1], m13, 1
4054.w32_loop_start:
4055    mova                m13, m12
4056    psrldq               m7, m12, 8
4057    punpcklwd            m7, m12
4058    vpblendd             m0, m6, 0x33           ; ___0 4321 | _056 4321
4059    vpblendd             m0, m7, 0x80           ; 56_0 4321 | _056 4321
4060    FILTER_2BLK          10, 0, 6, 7, 8, 14, 15
4061    vpermq              m12, m10, q2031
4062    mova            [rsp+0], m0
4063    psrldq               m8, m11, 8
4064    psrldq              xm6, xm12, 2
4065    psrldq              xm7, xm12, 10
4066    psrldq              xm0, xm13, 2
4067    punpcklwd            m8, m11
4068    punpcklwd           xm7, xm6
4069    vpblendd             m8, m9, 0x73           ; 56_0 4321 | ____ 4321
4070    vpblendd             m8, m7, 0x04           ; 56_0 4321 | __56 4321
4071    vpblendd             m8, m0, 0x08           ; 56_0 4321 | _056 4321
4072    call .main
4073    vpermq               m8, m11, q3120
4074    vpblendd             m6, m8, m9, 0xCC
4075    mova         [dstq+strideq*0+16], xm6
4076    vextracti128 [dstq+strideq*1+16], m6, 1
4077    lea                dstq, [dstq+strideq*2]
4078    sub                  hd, 2
4079    jg .w32_loop_left
4080    jz .w32_loop_last
4081    vpermq               m8, m9, q3120
4082    vextracti128        xm0, m8, 1              ; 4321 ____
4083    pshufd             xm11, xm11, q1032
4084    vpblendd            xm0, xm11, 0x02         ; 4321 0___
4085    psrldq              xm6, xm8, 2
4086    psrldq              xm7, xm8, 12
4087    pblendw             xm0, xm6, 0x4           ; 4321 05__
4088    pblendw             xm0, xm7, 0x2           ; 4321 056_
4089    FILTER_1BLK           6, 0, 7, [base+filter_shuf2], 15
4090    vpermq              m12, m13, q1302
4091    vpblendd            m12, m10, 0xCC
4092    vpblendd             m9, m6, 0xCC
4093    mova         [dstq+strideq*0+ 0], xm12
4094    mova         [dstq+strideq*0+16], xm9
4095    vextracti128 [dstq+strideq*1+ 0], m12, 1
4096    vextracti128 [dstq+strideq*1+16], m9, 1
4097    RET
4098.main:
4099    FILTER_2BLK           9, 8, 6, 7, 0, 14, 15
4100    ret
4101
; t0 is a scratch register used below to hold jump-table base pointers.
; On WIN64 the ac argument arrives on the stack, so r5 (the ac register
; slot) is free until `movifnidn acq, acmp` is executed last; on SYSV
; r5 already holds ac on entry, so r7 is used instead.
%if WIN64
DECLARE_REG_TMP 5
%else
DECLARE_REG_TMP 7
%endif
4107
; Apply CfL alpha scaling to a register of AC coefficients and add the DC:
;   m%1 = dc + sign(ac*alpha) * ((|ac| * (|alpha| << 9) + 0x4000) >> 15)
; Expects: m0 = broadcast dc, m1 = broadcast alpha (signed),
;          m2 = |alpha| << 9.  Clobbers m3.
; The magnitude is scaled separately from the sign so that pmulhrsw's
; round-to-nearest behaves symmetrically around zero.
%macro IPRED_CFL 1 ; ac in, unpacked pixels out
    psignw               m3, m%1, m1 ; ac negated/zeroed per sign of alpha
    pabsw               m%1, m%1     ; |ac|
    pmulhrsw            m%1, m2      ; (|ac| * (|alpha|<<9) + 0x4000) >> 15
    psignw              m%1, m3      ; reapply the sign of ac*alpha
    paddw               m%1, m0      ; + dc
%endmacro
4115
; CfL prediction with the DC taken from the top row only:
;   dc = round2(sum(top[0..w-1]), w)
; Reuses the .hN horizontal-sum code from ipred_cfl_left (indexed by w
; instead of h), then tail-jumps to the shared .sN splat/AC loops.
cglobal ipred_cfl_top_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    add                 tlq, 2       ; skip the top-left pixel; tlq -> top row
    movd                xm4, wd
    pxor                 m6, m6
    vpbroadcastw         m7, r7m     ; bitdepth_max (clamp limit for output)
    pavgw               xm4, xm6     ; xm4 = w/2, rounding bias for the average
    tzcnt                wd, wd      ; wd = log2(w)
    movd                xm5, wd      ; xm5 = shift amount for the average
    movu                 m0, [tlq]   ; up to 16 top pixels (the .hN code sums them)
    lea                  t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd               r6, [t0+wq*4] ; .h4/.h8/.h16/.h32 summing code, shared with _left
    add                  r6, t0
    add                  t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    movsxd               wq, [t0+wq*4] ; .sN splat loop for this width
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
4134
; CfL prediction with the DC taken from the left column only:
;   dc = round2(sum(left[0..h-1]), h)
; The .hN labels below horizontally sum 4/8/16/32 pixels already loaded
; into m0; they are also jump targets for ipred_cfl_top (with w as the
; count).  n/2 rounding bias is in xm4, log2(n) shift in xm5.
cglobal ipred_cfl_left_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    mov                  hd, hm ; zero upper half
    sub                 tlq, hq
    movd                xm4, hd
    sub                 tlq, hq     ; tlq -= 2*h: point at the first left pixel
    pxor                 m6, m6
    vpbroadcastw         m7, r7m    ; bitdepth_max
    pavgw               xm4, xm6    ; xm4 = h/2, rounding bias
    tzcnt               r6d, hd     ; log2(h)
    movd                xm5, r6d
    movu                 m0, [tlq]  ; up to 16 left pixels
    lea                  t0, [ipred_cfl_left_16bpc_avx2_table]
    movsxd               r6, [t0+r6*4]
    add                  r6, t0
    add                  t0, ipred_cfl_splat_16bpc_avx2_table-ipred_cfl_left_16bpc_avx2_table
    tzcnt                wd, wd
    movsxd               wq, [t0+wq*4] ; .sN splat loop for this width
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h32:
    paddw                m0, [tlq+32] ; fold pixels 16-31 onto 0-15
.h16:
    vextracti128        xm1, m0, 1    ; fold the upper 8 words
    paddw               xm0, xm1
.h8:
    psrldq              xm1, xm0, 8   ; fold the upper 4 words
    paddw               xm0, xm1
.h4:
    punpcklwd           xm0, xm6      ; widen to dwords (sum can exceed 16 bits)
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1      ; total sum in the low dword
    paddd               xm0, xm4      ; + n/2
    psrld               xm0, xm5      ; dc = (sum + n/2) >> log2(n)
    vpbroadcastw         m0, xm0
    jmp                  wq           ; tail-call the splat/AC loop
4173
; Full CfL prediction: dc = round2(sum(top, w) + sum(left, h), w+h), then
; splat the dc and add the alpha-scaled AC block, clamping each output to
; [0, bitdepth_max].  For rectangular blocks w+h is 3*2^k or 5*2^k; after
; shifting right by tzcnt(w+h) the remaining /3 or /5 is done with a
; fixed-point multiply: pmulhuw by 0xAAAB ~= (1<<17)/3 or 0x6667 ~=
; (1<<17)/5, followed by a final >> 1.
; The .sN splat loops are also jump targets for the _top/_left/_128
; variants via ipred_cfl_splat_16bpc_avx2_table.
cglobal ipred_cfl_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    movifnidn            hd, hm
    movifnidn            wd, wm
    tzcnt               r6d, hd
    lea                 t0d, [wq+hq]       ; number of edge pixels averaged
    movd                xm4, t0d
    tzcnt               t0d, t0d
    movd                xm5, t0d           ; shift = tzcnt(w+h)
    lea                  t0, [ipred_cfl_16bpc_avx2_table]
    tzcnt                wd, wd
    movsxd               r6, [t0+r6*4]     ; .hN: load left pixels
    movsxd               wq, [t0+wq*4+4*4] ; .wN: add top pixels, reduce, splat
    psrlw               xm4, 1             ; (w+h)/2 rounding bias
    pxor                 m6, m6
    vpbroadcastw         m7, r7m           ; bitdepth_max
    add                  r6, t0
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  r6
.h4:
    movq                xm0, [tlq-8]       ; 4 left pixels
    jmp                  wq
.w4:
    movq                xm1, [tlq+2]       ; 4 top pixels
    paddw                m0, m4            ; fold the rounding bias into lane 0
    paddw                m0, m1
    psrlq                m1, m0, 32        ; horizontal word reduction
    paddw                m0, m1
    psrld                m1, m0, 16
    paddw                m0, m1
    cmp                  hd, 4
    jg .w4_mul
    psrlw               xm0, 3             ; w == h == 4: exact /8
    jmp .w4_end
.w4_mul:
    vextracti128        xm1, m0, 1
    paddw               xm0, xm1
    lea                 r2d, [hq*2]
    mov                 r6d, 0xAAAB6667
    shrx                r6d, r6d, r2d      ; h==8 -> 0xAAAB (/3, n=12); h==16 -> 0x6667 (/5, n=20)
    punpckhwd           xm1, xm0, xm6      ; widen to dwords
    punpcklwd           xm0, xm6
    paddd               xm0, xm1
    movd                xm1, r6d
    psrld               xm0, 2             ; /4, the power-of-two factor of w+h
    pmulhuw             xm0, xm1           ; * ~(1<<17)/d, keep high word
    psrlw               xm0, 1             ; net: division by d with rounding
.w4_end:
    vpbroadcastw         m0, xm0
.s4:
    vpbroadcastw         m1, alpham        ; signed CfL alpha
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9             ; |alpha| << 9 for IPRED_CFL's pmulhrsw
.s4_loop:
    mova                 m4, [acq]         ; 4 rows of 4 AC coefficients
    IPRED_CFL             4
    pmaxsw               m4, m6            ; clamp to [0, bitdepth_max]
    pminsw               m4, m7
    vextracti128        xm5, m4, 1
    movq   [dstq+strideq*0], xm4
    movq   [dstq+strideq*2], xm5
    movhps [dstq+strideq*1], xm4
    movhps [dstq+r6       ], xm5
    lea                dstq, [dstq+strideq*4]
    add                 acq, 32
    sub                  hd, 4
    jg .s4_loop
    RET
ALIGN function_align
.h8:
    mova                xm0, [tlq-16]      ; 8 left pixels
    jmp                  wq
.w8:
    vextracti128        xm1, m0, 1
    paddw               xm0, [tlq+2]       ; add 8 top pixels
    paddw               xm0, xm4           ; + (w+h)/2 bias
    paddw               xm0, xm1
    psrld               xm1, xm0, 16
    paddw               xm0, xm1
    pblendw             xm0, xm6, 0xAA     ; zero the odd words -> dword partial sums
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5           ; >> tzcnt(w+h)
    cmp                  hd, 8
    je .w8_end                             ; n == 16: the shift was exact
    mov                 r6d, 0xAAAB        ; /3 for n = 12 or 24
    mov                 r2d, 0x6667        ; /5 for n = 40 (h == 32)
    cmp                  hd, 32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w8_end:
    vpbroadcastw         m0, xm0
.s8:
    vpbroadcastw         m1, alpham
    lea                  r6, [strideq*3]
    pabsw                m2, m1
    psllw                m2, 9
.s8_loop:
    mova                 m4, [acq]         ; 2 rows of 8 AC coefficients each
    mova                 m5, [acq+32]
    IPRED_CFL             4
    IPRED_CFL             5
    pmaxsw               m4, m6            ; clamp to [0, bitdepth_max]
    pmaxsw               m5, m6
    pminsw               m4, m7
    pminsw               m5, m7
    mova         [dstq+strideq*0], xm4
    mova         [dstq+strideq*2], xm5
    vextracti128 [dstq+strideq*1], m4, 1
    vextracti128 [dstq+r6       ], m5, 1
    lea                dstq, [dstq+strideq*4]
    add                 acq, 64
    sub                  hd, 4
    jg .s8_loop
    RET
ALIGN function_align
.h16:
    mova                 m0, [tlq-32]      ; 16 left pixels
    jmp                  wq
.w16:
    paddw                m0, [tlq+2]       ; add 16 top pixels
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4           ; + (w+h)/2 bias
    paddw               xm0, xm1
    punpckhwd           xm1, xm0, xm6      ; widen to dwords before reducing
    punpcklwd           xm0, xm6
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5           ; >> tzcnt(w+h)
    cmp                  hd, 16
    je .w16_end                            ; n == 32: the shift was exact
    mov                 r6d, 0xAAAB        ; /3 for h == 8 or 32 (n = 24, 48)
    mov                 r2d, 0x6667        ; /5 for h == 4 or 64 (n = 20, 80)
    test                 hb, 8|32
    cmovz               r6d, r2d
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w16_end:
    vpbroadcastw         m0, xm0
.s16:
    vpbroadcastw         m1, alpham
    pabsw                m2, m1
    psllw                m2, 9
.s16_loop:
    mova                 m4, [acq]         ; one full row of 16 AC coefficients
    mova                 m5, [acq+32]
    IPRED_CFL             4
    IPRED_CFL             5
    pmaxsw               m4, m6            ; clamp to [0, bitdepth_max]
    pmaxsw               m5, m6
    pminsw               m4, m7
    pminsw               m5, m7
    mova   [dstq+strideq*0], m4
    mova   [dstq+strideq*1], m5
    lea                dstq, [dstq+strideq*2]
    add                 acq, 64
    sub                  hd, 2
    jg .s16_loop
    RET
ALIGN function_align
.h32:
    mova                 m0, [tlq-64]      ; 32 left pixels, folded to 16 lanes
    paddw                m0, [tlq-32]
    jmp                  wq
.w32:
    paddw                m0, [tlq+ 2]      ; add 32 top pixels
    paddw                m0, [tlq+34]
    vextracti128        xm1, m0, 1
    paddw               xm0, xm4           ; + (w+h)/2 bias
    paddw               xm0, xm1
    punpcklwd           xm1, xm0, xm6      ; widen to dwords before reducing
    punpckhwd           xm0, xm6
    paddd               xm0, xm1
    psrlq               xm1, xm0, 32
    paddd               xm0, xm1
    psrldq              xm1, xm0, 8
    paddd               xm0, xm1
    psrld               xm0, xm5           ; >> tzcnt(w+h)
    cmp                  hd, 32
    je .w32_end                            ; n == 64: the shift was exact
    lea                 r2d, [hq*2]
    mov                 r6d, 0x6667AAAB
    shrx                r6d, r6d, r2d      ; h==8 -> 0x6667 (/5, n=40); else 0xAAAB (/3, n=48,96)
    movd                xm1, r6d
    pmulhuw             xm0, xm1
    psrlw               xm0, 1
.w32_end:
    vpbroadcastw         m0, xm0
.s32:
    vpbroadcastw         m1, alpham
    pabsw                m2, m1
    psllw                m2, 9
.s32_loop:
    mova                 m4, [acq]         ; one full row of 32 AC coefficients
    mova                 m5, [acq+32]
    IPRED_CFL             4
    IPRED_CFL             5
    pmaxsw               m4, m6            ; clamp to [0, bitdepth_max]
    pmaxsw               m5, m6
    pminsw               m4, m7
    pminsw               m5, m7
    mova        [dstq+32*0], m4
    mova        [dstq+32*1], m5
    add                dstq, strideq
    add                 acq, 64
    dec                  hd
    jg .s32_loop
    RET
4391
; CfL prediction with no available neighbors: dc is the mid-gray value for
; the bitdepth.  bitdepth_max >> 11 gives 0 for 10-bit (1023) and 1 for
; 12-bit (4095), indexing pw_512 to pick 512 or the following constant
; (presumably 2048 -- the table itself is outside this view).
; Tail-jumps to the shared ipred_cfl .sN splat/AC loops.
cglobal ipred_cfl_128_16bpc, 3, 7, 8, dst, stride, tl, w, h, ac, alpha
    mov                 r6d, r7m          ; bitdepth_max
    shr                 r6d, 11           ; 0 (10-bit) or 1 (12-bit)
    lea                  t0, [ipred_cfl_splat_16bpc_avx2_table]
    tzcnt                wd, wd
    movifnidn            hd, hm
    movsxd               wq, [t0+wq*4]
    vpbroadcastd         m0, [t0-ipred_cfl_splat_16bpc_avx2_table+pw_512+r6*4] ; dc = 1 << (bitdepth-1)
    pxor                 m6, m6
    vpbroadcastw         m7, r7m           ; clamp limit
    add                  wq, t0
    movifnidn           acq, acmp
    jmp                  wq
4405
; Generate zero-mean AC coefficients for 4:2:0 CfL: each output sample is
; the sum of a 2x2 luma block scaled by 2 (pmaddwd with pw_2 sums each
; horizontal pair x2; two rows are then added), i.e. the 2x2 average << 3.
; wpad/hpad replicate the last valid column/row.  m4 accumulates the dword
; sum of all outputs so .dc can subtract the rounded mean at the end.
; .dc and .hpad are shared entry points, tail-called by the 422/444
; variants via mangle() -- keep them stable.
cglobal ipred_cfl_ac_420_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
    vpbroadcastd         m5, [pw_2]
    mov                  hd, hm
    shl               hpadd, 2             ; hpad arrives in units of 4 rows -- convert to rows
    pxor                 m4, m4            ; running dword sum of all AC outputs
    sub                  hd, hpadd         ; rows actually read from ypx
    cmp            dword wm, 8
    jg .w16
    je .w8
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq           ; remember the start of ac for .dc
.w4_loop:
    ; rows 2/3 go in the low lane, rows 0/1 in the high lane, so the
    ; packssdw below emits the output rows in order
    mova                xm0, [ypxq+strideq*2]
    mova                xm1, [ypxq+r3       ]
    vinserti128          m0, [ypxq+strideq*0], 1
    vinserti128          m1, [ypxq+strideq*1], 1
    lea                ypxq, [ypxq+strideq*4]
    pmaddwd              m0, m5            ; 2*(horizontal pair sums)
    pmaddwd              m1, m5
    paddd                m0, m1            ; + the row below = 2x2 sums x2
    vextracti128        xm1, m0, 1
    paddd                m4, m0
    packssdw            xm1, xm0           ; {rows 0-1 output, rows 2-3 output}
    mova              [acq], xm1
    add                 acq, 16
    sub                  hd, 2
    jg .w4_loop
    test              hpadd, hpadd
    jz .dc
    vpermq               m1, m1, q1111     ; replicate the last output row 4x
    pslld               xm0, 2             ; last row sum x4 (xmm write zeroes the upper lane)
.w4_hpad_loop:
    mova              [acq], m1            ; store 4 padded rows per iteration
    paddd                m4, m0
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp .dc
.w8:
    mov                  r5, acq
    test              wpadd, wpadd
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0]
    pmaddwd              m1, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1            ; 2x2 sums x2 for one output row
    vextracti128        xm1, m0, 1
    paddd                m4, m0
    packssdw            xm1, xm0, xm1      ; one row of 8 word outputs
    mova              [acq], xm1
    add                 acq, 16
    dec                  hd
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz .dc
    vinserti128          m1, xm1, 1        ; replicate the last row into both lanes
    pslld                m0, 2             ; last row sum x4 (4 padded rows per .hpad iter)
    jmp .hpad
.w8_wpad1:
    ; right half padded: compute 4 valid outputs, replicate the last one
    pmaddwd             xm0, xm5, [ypxq+strideq*0]
    pmaddwd             xm3, xm5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd               xm0, xm3
    pshufd              xm3, xm0, q3333    ; broadcast the last valid output
    packssdw            xm1, xm0, xm3
    paddd               xm0, xm3
    paddd               xm4, xm0           ; padded samples count toward the mean too
    mova              [acq], xm1
    add                 acq, 16
    dec                  hd
    jg .w8_wpad1
    jmp .w8_hpad
.w16_wpad:
    ; replace the padded tail of each 32-px luma row with the last valid
    ; pixel pair (wpad selects how much of the row is valid)
    mova                 m0, [ypxq+strideq*0+ 0]
    mova                 m1, [ypxq+strideq*1+ 0]
    cmp               wpadd, 2
    jl .w16_wpad1
    je .w16_wpad2
    vpbroadcastd         m2, [ypxq+strideq*0+12] ; only 8 px valid
    vpbroadcastd         m3, [ypxq+strideq*1+12]
    vpblendd             m0, m2, 0xf0
    vpblendd             m1, m3, 0xf0
    jmp .w16_wpad_end
.w16_wpad2:
    vpbroadcastd         m2, [ypxq+strideq*0+28] ; 16 px valid
    vpbroadcastd         m3, [ypxq+strideq*1+28]
    jmp .w16_wpad_end
.w16_wpad1:
    vpbroadcastd         m2, [ypxq+strideq*0+44] ; 24 px valid
    vpbroadcastd         m3, [ypxq+strideq*1+44]
    vinserti128          m2, [ypxq+strideq*0+32], 0
    vinserti128          m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    REPX    {pmaddwd x, m5}, m0, m1, m2, m3
    paddd                m0, m1
    paddd                m2, m3
    packssdw             m1, m0, m2
    paddd                m0, m2
    vpermq               m1, m1, q3120     ; fix the per-lane pack interleave
    paddd                m4, m0
    mova              [acq], m1
    add                 acq, 32
    dec                  hd
    jg .w16_wpad
    jmp .w16_hpad
.w16:
    mov                  r5, acq
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmaddwd              m0, m5, [ypxq+strideq*0+ 0]
    pmaddwd              m2, m5, [ypxq+strideq*0+32]
    pmaddwd              m1, m5, [ypxq+strideq*1+ 0]
    pmaddwd              m3, m5, [ypxq+strideq*1+32]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m0, m1            ; left 8 outputs
    paddd                m2, m3            ; right 8 outputs
    packssdw             m1, m0, m2
    paddd                m0, m2
    vpermq               m1, m1, q3120     ; fix the per-lane pack interleave
    paddd                m4, m0
    mova              [acq], m1
    add                 acq, 32
    dec                  hd
    jg .w16_loop
.w16_hpad:
    add               hpadd, hpadd         ; 2 rows per 32-byte store below
    jz .dc
    paddd                m0, m0            ; last row sum x2
.hpad:
    mova         [acq+32*0], m1            ; replicate the last output row
    paddd                m4, m0
    mova         [acq+32*1], m1
    add                 acq, 32*2
    sub               hpadd, 4
    jg .hpad
.dc:
    ; subtract the rounded mean from all w*h samples
    vextracti128        xm1, m4, 1
    sub                  r5, acq ; -w*h*2
    tzcnt               r1d, r5d           ; log2(w*h) + 1
    paddd               xm4, xm1
    sub                 r1d, 2             ; shift = log2(w*h) - 1
    punpckhqdq          xm1, xm4, xm4
    movd                xm0, r1d
    paddd               xm1, xm4
    pshuflw             xm4, xm1, q1032
    paddd               xm1, xm4           ; total dword sum in lane 0
    psrld               xm1, xm0
    pxor                xm0, xm0
    pavgw               xm1, xm0           ; (x+1)>>1: net round2(sum, w*h)
    vpbroadcastw         m1, xm1
.dc_loop:
    mova                 m0, [acq+r5]
    psubw                m0, m1
    mova           [acq+r5], m0
    add                  r5, 32
    jl .dc_loop
    RET
4569
; Generate zero-mean AC coefficients for 4:2:2 CfL: horizontal 2:1
; subsampling only.  Each output is a horizontal luma pair summed and
; scaled by 4 (pmaddwd with pw_4), i.e. the pair average << 3, matching
; the 420/444 scaling.  Shares .dc/.hpad/.w16_hpad with the 420 version
; via mangle(); m4 accumulates the dword sum for the mean as there.
cglobal ipred_cfl_ac_422_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
    movifnidn         hpadd, hpadm
    vpbroadcastd         m5, [pw_4]
    mov                  hd, hm
    shl               hpadd, 2             ; hpad arrives in units of 4 rows
    pxor                 m4, m4            ; running dword sum of all AC outputs
    sub                  hd, hpadd
    cmp            dword wm, 8
    jg .w16
    je .w8
.w4:
    lea                  r3, [strideq*3]
    mov                  r5, acq           ; remember the start of ac for .dc
.w4_loop:
    mova                xm0, [ypxq+strideq*0]
    mova                xm1, [ypxq+strideq*1]
    vinserti128          m0, [ypxq+strideq*2], 1
    vinserti128          m1, [ypxq+r3       ], 1
    lea                ypxq, [ypxq+strideq*4]
    pmaddwd              m0, m5            ; 4*(pair sums)
    pmaddwd              m1, m5
    paddd                m4, m0
    packssdw             m0, m1            ; rows 0/1 (low lane), rows 2/3 (high lane)
    paddd                m4, m1
    mova              [acq], m0
    add                 acq, 32
    sub                  hd, 4
    jg .w4_loop
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vextracti128        xm1, m1, 1         ; dword sums of the last row
    vpermq               m0, m0, q3333     ; replicate the last output row 4x
    pslld               xm1, 2             ; last row sum x4 per 4-row store
.w4_hpad_loop:
    mova              [acq], m0
    paddd                m4, m1
    add                 acq, 32
    sub               hpadd, 4
    jg .w4_hpad_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
.w8:
    mov                  r5, acq
    test              wpadd, wpadd
    jnz .w8_wpad1
.w8_loop:
    pmaddwd              m1, m5, [ypxq+strideq*0]
    pmaddwd              m0, m5, [ypxq+strideq*1]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m4, m1
    packssdw             m1, m0            ; rows 0/1, lane-interleaved
    paddd                m4, m0
    vpermq               m2, m1, q3120     ; fix the per-lane pack interleave
    mova              [acq], m2
    add                 acq, 32
    sub                  hd, 2
    jg .w8_loop
.w8_hpad:
    test              hpadd, hpadd
    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
    vpermq               m1, m1, q3131     ; replicate the last output row 2x
    pslld                m0, 2             ; last row sum x4 (4 padded rows per .hpad iter)
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
.w8_wpad1:
    ; right half padded: broadcast the last valid pixel pair over the tail
    vpbroadcastd         m1, [ypxq+strideq*0+12]
    vpbroadcastd         m0, [ypxq+strideq*1+12]
    vinserti128          m1, [ypxq+strideq*0+ 0], 0
    vinserti128          m0, [ypxq+strideq*1+ 0], 0
    lea                ypxq, [ypxq+strideq*2]
    pmaddwd              m1, m5
    pmaddwd              m0, m5
    paddd                m4, m1
    packssdw             m1, m0
    paddd                m4, m0
    vpermq               m2, m1, q3120
    mova              [acq], m2
    add                 acq, 32
    sub                  hd, 2
    jg .w8_wpad1
    jmp .w8_hpad
.w16:
    mov                  r5, acq
    test              wpadd, wpadd
    jnz .w16_wpad
.w16_loop:
    pmaddwd              m2, m5, [ypxq+strideq*0+ 0]
    pmaddwd              m1, m5, [ypxq+strideq*0+32]
    pmaddwd              m0, m5, [ypxq+strideq*1+ 0]
    pmaddwd              m3, m5, [ypxq+strideq*1+32]
    lea                ypxq, [ypxq+strideq*2]
    paddd                m4, m2
    packssdw             m2, m1            ; row 0 (16 outputs)
    paddd                m4, m1
    packssdw             m1, m0, m3        ; row 1
    paddd                m0, m3
    vpermq               m2, m2, q3120     ; fix the per-lane pack interleave
    paddd                m4, m0
    vpermq               m1, m1, q3120
    mova         [acq+32*0], m2
    mova         [acq+32*1], m1
    add                 acq, 32*2
    sub                  hd, 2
    jg .w16_loop
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
.w16_wpad:
    ; replace the padded tail of each 32-px luma row with the last valid
    ; pixel pair (wpad selects how much of the row is valid)
    mova                 m2, [ypxq+strideq*0+ 0]
    mova                 m0, [ypxq+strideq*1+ 0]
    cmp               wpadd, 2
    jl .w16_wpad1
    je .w16_wpad2
    vpbroadcastd         m1, [ypxq+strideq*0+12] ; only 8 px valid
    vpbroadcastd         m3, [ypxq+strideq*1+12]
    vpblendd             m2, m1, 0xf0
    vpblendd             m0, m3, 0xf0
    jmp .w16_wpad_end
.w16_wpad2:
    vpbroadcastd         m1, [ypxq+strideq*0+28] ; 16 px valid
    vpbroadcastd         m3, [ypxq+strideq*1+28]
    jmp .w16_wpad_end
.w16_wpad1:
    vpbroadcastd         m1, [ypxq+strideq*0+44] ; 24 px valid
    vpbroadcastd         m3, [ypxq+strideq*1+44]
    vinserti128          m1, [ypxq+strideq*0+32], 0
    vinserti128          m3, [ypxq+strideq*1+32], 0
.w16_wpad_end:
    lea                ypxq, [ypxq+strideq*2]
    REPX    {pmaddwd x, m5}, m2, m0, m1, m3
    paddd                m4, m2
    packssdw             m2, m1            ; row 0
    paddd                m4, m1
    packssdw             m1, m0, m3        ; row 1
    paddd                m0, m3
    vpermq               m2, m2, q3120
    paddd                m4, m0
    vpermq               m1, m1, q3120
    mova         [acq+32*0], m2
    mova         [acq+32*1], m1
    add                 acq, 32*2
    sub                  hd, 2
    jg .w16_wpad
    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).w16_hpad
4710
4711cglobal ipred_cfl_ac_444_16bpc, 4, 7, 6, ac, ypx, stride, wpad, hpad, w, h
4712    lea                  r6, [ipred_cfl_ac_444_16bpc_avx2_table]
4713    tzcnt                wd, wm
4714    movifnidn         hpadd, hpadm
4715    vpbroadcastd         m5, [pw_1]
4716    movsxd               wq, [r6+wq*4]
4717    shl               hpadd, 2
4718    add                  wq, r6
4719    mov                  hd, hm
4720    pxor                 m4, m4
4721    sub                  hd, hpadd
4722    jmp                  wq
4723.w4:
4724    lea                  r3, [strideq*3]
4725    mov                  r5, acq
4726.w4_loop:
4727    movq                xm0, [ypxq+strideq*0]
4728    movhps              xm0, [ypxq+strideq*1]
4729    vpbroadcastq         m1, [ypxq+strideq*2]
4730    vpbroadcastq         m2, [ypxq+r3       ]
4731    lea                ypxq, [ypxq+strideq*4]
4732    vpblendd             m0, m1, 0x30
4733    vpblendd             m0, m2, 0xc0
4734    psllw                m0, 3
4735    pmaddwd              m1, m0, m5
4736    mova              [acq], m0
4737    add                 acq, 32
4738    paddd                m4, m1
4739    sub                  hd, 4
4740    jg .w4_loop
4741    test              hpadd, hpadd
4742    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4743    vpermq               m0, m0, q3333
4744    paddd                m1, m1
4745    mova         [acq+32*0], m0
4746    vpermq               m1, m1, q3333
4747    mova         [acq+32*1], m0
4748    add                 acq, 32*2
4749    paddd                m4, m1
4750    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4751.w8:
4752    lea                  r3, [strideq*3]
4753    mov                  r5, acq
4754.w8_loop:
4755    mova                xm2, [ypxq+strideq*0]
4756    vinserti128          m2, [ypxq+strideq*1], 1
4757    mova                xm1, [ypxq+strideq*2]
4758    vinserti128          m1, [ypxq+r3       ], 1
4759    lea                ypxq, [ypxq+strideq*4]
4760    psllw                m2, 3
4761    psllw                m1, 3
4762    mova         [acq+32*0], m2
4763    pmaddwd              m2, m5
4764    mova         [acq+32*1], m1
4765    pmaddwd              m0, m1, m5
4766    add                 acq, 32*2
4767    paddd                m4, m2
4768    paddd                m4, m0
4769    sub                  hd, 4
4770    jg .w8_loop
4771    test              hpadd, hpadd
4772    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4773    vperm2i128           m1, m1, 0x11
4774    pslld                m0, 2
4775    pxor                 m2, m2
4776    vpblendd             m0, m2, 0x0f
4777    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
4778.w16_wpad2:
4779    vpbroadcastw         m3, [ypxq+strideq*0+14]
4780    vpbroadcastw         m0, [ypxq+strideq*1+14]
4781    vpblendd             m2, m3, 0xf0
4782    vpblendd             m1, m0, 0xf0
4783    jmp .w16_wpad_end
4784.w16:
4785    mov                  r5, acq
4786.w16_loop:
4787    mova                 m2, [ypxq+strideq*0]
4788    mova                 m1, [ypxq+strideq*1]
4789    test              wpadd, wpadd
4790    jnz .w16_wpad2
4791.w16_wpad_end:
4792    lea                ypxq, [ypxq+strideq*2]
4793    psllw                m2, 3
4794    psllw                m1, 3
4795    mova         [acq+32*0], m2
4796    pmaddwd              m2, m5
4797    mova         [acq+32*1], m1
4798    pmaddwd              m0, m1, m5
4799    add                 acq, 32*2
4800    paddd                m4, m2
4801    paddd                m4, m0
4802    sub                  hd, 2
4803    jg .w16_loop
4804    add               hpadd, hpadd
4805    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4806    paddd                m0, m0
4807    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).hpad
4808.w32:
4809    mov                  r5, acq
4810    test              wpadd, wpadd
4811    jnz .w32_wpad
4812.w32_loop:
4813    mova                 m0, [ypxq+ 0]
4814    mova                 m1, [ypxq+32]
4815    add                ypxq, strideq
4816    psllw                m0, 3
4817    psllw                m1, 3
4818    pmaddwd              m2, m0, m5
4819    mova         [acq+32*0], m0
4820    pmaddwd              m3, m1, m5
4821    mova         [acq+32*1], m1
4822    add                 acq, 32*2
4823    paddd                m2, m3
4824    paddd                m4, m2
4825    dec                  hd
4826    jg .w32_loop
4827.w32_hpad:
4828    test              hpadd, hpadd
4829    jz mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4830    paddd                m2, m2
4831.w32_hpad_loop:
4832    mova         [acq+32*0], m0
4833    mova         [acq+32*1], m1
4834    paddd                m4, m2
4835    mova         [acq+32*2], m0
4836    mova         [acq+32*3], m1
4837    add                 acq, 32*4
4838    sub               hpadd, 2
4839    jg .w32_hpad_loop
4840    jmp mangle(private_prefix %+ _ipred_cfl_ac_420_16bpc_avx2).dc
4841.w32_wpad:
4842    mova                 m0, [ypxq+ 0]
4843    cmp               wpadd, 4
4844    jl .w32_wpad2
4845    je .w32_wpad4
4846    vpbroadcastw         m1, [ypxq+14]
4847    vpblendd             m0, m1, 0xf0
4848    jmp .w32_wpad_end
4849.w32_wpad4:
4850    vpbroadcastw         m1, [ypxq+30]
4851    jmp .w32_wpad_end
4852.w32_wpad2:
4853    vpbroadcastw         m1, [ypxq+46]
4854    vinserti128          m1, [ypxq+32], 0
4855.w32_wpad_end:
4856    add                ypxq, strideq
4857    psllw                m0, 3
4858    psllw                m1, 3
4859    pmaddwd              m2, m0, m5
4860    mova         [acq+32*0], m0
4861    pmaddwd              m3, m1, m5
4862    mova         [acq+32*1], m1
4863    add                 acq, 32*2
4864    paddd                m2, m3
4865    paddd                m4, m2
4866    dec                  hd
4867    jg .w32_wpad
4868    jmp .w32_hpad
4869
4870cglobal pal_pred_16bpc, 4, 6, 6, dst, stride, pal, idx, w, h
4871    vbroadcasti128       m4, [palq]
4872    lea                  r2, [pal_pred_16bpc_avx2_table]
4873    tzcnt                wd, wm
4874    vbroadcasti128       m5, [pal_pred_shuf]
4875    movifnidn            hd, hm
4876    movsxd               wq, [r2+wq*4]
4877    pshufb               m4, m5
4878    punpckhqdq           m5, m4, m4
4879    add                  wq, r2
4880DEFINE_ARGS dst, stride, stride3, idx, w, h
4881    lea            stride3q, [strideq*3]
4882    jmp                  wq
4883.w4:
4884    movq                xm0, [idxq]
4885    add                idxq, 8
4886    psrlw               xm1, xm0, 4
4887    punpcklbw           xm0, xm1
4888    pshufb              xm1, xm4, xm0
4889    pshufb              xm2, xm5, xm0
4890    punpcklbw           xm0, xm1, xm2
4891    punpckhbw           xm1, xm2
4892    movq   [dstq+strideq*0], xm0
4893    movq   [dstq+strideq*2], xm1
4894    movhps [dstq+strideq*1], xm0
4895    movhps [dstq+stride3q ], xm1
4896    lea                dstq, [dstq+strideq*4]
4897    sub                  hd, 4
4898    jg .w4
4899    RET
4900.w8:
4901    pmovzxbw             m2, [idxq]
4902    add                idxq, 16
4903    psllw                m1, m2, 4
4904    por                  m2, m1
4905    pshufb               m1, m4, m2
4906    pshufb               m2, m5, m2
4907    punpcklbw            m0, m1, m2
4908    punpckhbw            m1, m2
4909    mova         [dstq+strideq*0], xm0
4910    mova         [dstq+strideq*1], xm1
4911    vextracti128 [dstq+strideq*2], m0, 1
4912    vextracti128 [dstq+stride3q ], m1, 1
4913    lea                dstq, [dstq+strideq*4]
4914    sub                  hd, 4
4915    jg .w8
4916    RET
4917.w16:
4918    pshufd               m3, [idxq], q3120
4919    add                idxq, 32
4920    vpermq               m3, m3, q3120
4921    psrlw                m1, m3, 4
4922    punpcklbw            m2, m3, m1
4923    punpckhbw            m3, m1
4924    pshufb               m1, m4, m2
4925    pshufb               m2, m5, m2
4926    punpcklbw            m0, m1, m2
4927    punpckhbw            m1, m2
4928    mova   [dstq+strideq*0], m0
4929    mova   [dstq+strideq*1], m1
4930    pshufb               m1, m4, m3
4931    pshufb               m3, m5, m3
4932    punpcklbw            m0, m1, m3
4933    punpckhbw            m1, m3
4934    mova   [dstq+strideq*2], m0
4935    mova   [dstq+stride3q ], m1
4936    lea                dstq, [dstq+strideq*4]
4937    sub                  hd, 4
4938    jg .w16
4939    RET
4940.w32:
4941    pshufd               m3, [idxq], q3120
4942    add                idxq, 32
4943    vpermq               m3, m3, q3120
4944    psrlw                m1, m3, 4
4945    punpcklbw            m2, m3, m1
4946    punpckhbw            m3, m1
4947    pshufb               m1, m4, m2
4948    pshufb               m2, m5, m2
4949    punpcklbw            m0, m1, m2
4950    punpckhbw            m1, m2
4951    mova          [dstq+ 0], m0
4952    mova          [dstq+32], m1
4953    pshufb               m1, m4, m3
4954    pshufb               m3, m5, m3
4955    punpcklbw            m0, m1, m3
4956    punpckhbw            m1, m3
4957    mova  [dstq+strideq+ 0], m0
4958    mova  [dstq+strideq+32], m1
4959    lea                dstq, [dstq+strideq*2]
4960    sub                  hd, 2
4961    jg .w32
4962    RET
4963.w64:
4964    pshufd               m3, [idxq], q3120
4965    add                idxq, 32
4966    vpermq               m3, m3, q3120
4967    psrlw                m1, m3, 4
4968    punpcklbw            m2, m3, m1
4969    punpckhbw            m3, m1
4970    pshufb               m1, m4, m2
4971    pshufb               m2, m5, m2
4972    punpcklbw            m0, m1, m2
4973    punpckhbw            m1, m2
4974    mova        [dstq+32*0], m0
4975    mova        [dstq+32*1], m1
4976    pshufb               m1, m4, m3
4977    pshufb               m3, m5, m3
4978    punpcklbw            m0, m1, m3
4979    punpckhbw            m1, m3
4980    mova        [dstq+32*2], m0
4981    mova        [dstq+32*3], m1
4982    add                 dstq, strideq
4983    dec                   hd
4984    jg .w64
4985    RET
4986
4987%endif
4988