• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright © 2024, VideoLAN and dav1d authors
3 * Copyright © 2024, Janne Grunau
4 * Copyright © 2024, Martin Storsjo
5 * Copyright © 2024, Arm Limited
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright notice, this
12 *    list of conditions and the following disclaimer.
13 *
14 * 2. Redistributions in binary form must reproduce the above copyright notice,
15 *    this list of conditions and the following disclaimer in the documentation
16 *    and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29
30#include "src/arm/asm.S"
31#include "util.S"
32
33
34#if HAVE_DOTPROD
35ENABLE_DOTPROD
36
// Packed filter-type constants: each packs two 7-bit base indices into
// `mc_subpel_filters` — one in bits [13:7] and one in bits [6:0] — and the
// appropriate half is selected at runtime by block size (see the
// `cmp \w, #4` / `csel` and `ubfx`/`and` extraction below). Each index is
// pre-decremented by 1 so that the -1 offset is folded into the address
// computation of `mc_subpel_filters` instead of needing a separate subtract.
// No spaces inside these expressions, due to gas-preprocessor.
#define REGULAR1        (((0*15-1)<<7)|(3*15-1))
#define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
#define SHARP1          (((2*15-1)<<7)|(3*15-1))

// Alignment exponents (`.align N` aligns to 2^N bytes here) applied to
// function entries, jump targets and loop heads in this file.
#define FUNC_ALIGN      2
#define JUMP_ALIGN      2
#define LOOP_ALIGN      2
46
47
// Lookup table used to help conversion of shifted 32-bit values to 8-bit.
// Byte pair (4*i+1, 4*i+2) selects the middle two bytes of the i-th 32-bit
// lane across a two-register TBL source, i.e. it narrows (x >> 8) of eight
// 32-bit lanes into eight 16-bit lanes in one TBL (used in the put HV path
// before the final sqrshrun).
        .align 4
L(hv_tbl_neon_dotprod):
        .byte  1,  2,  5,  6,   9, 10, 13, 14,  17, 18, 21, 22,  25, 26, 29, 30
52
// Shuffle indices to permute horizontal samples in preparation for input to
// SDOT instructions. The 8-tap horizontal convolution uses sample indices in the
// interval of [-3, 4] relative to the current sample position. We load samples
// from index value -4 to keep loads word aligned, so the shuffle bytes are
// translated by 1 to handle this.
// The table is consumed in 16-byte rows at offsets 0, 16 and 32 (loaded into
// q28/q29/q30 below); each row holds four 4-byte windows, and consecutive
// windows advance the sample position by one.
        .align 4
L(h_tbl_neon_dotprod):
        .byte  1,  2,  3,  4,   2,  3,  4,  5,   3,  4,  5,  6,   4,  5,  6,  7
        .byte  5,  6,  7,  8,   6,  7,  8,  9,   7,  8,  9, 10,   8,  9, 10, 11
        .byte  9, 10, 11, 12,  10, 11, 12, 13,  11, 12, 13, 14,  12, 13, 14, 15
63
// Vertical convolutions are also using SDOT instructions, where a 128-bit
// register contains a transposed 4x4 matrix of values. Subsequent iterations of
// the vertical convolution can reuse the 3x4 sub-matrix from the previous loop
// iteration. These shuffle indices shift and merge this 4x4 matrix with the
// values of a new line.
// Consumed in 16-byte rows at offsets 0, 16, 32, 48 and 64 (loaded into
// q6 and q28-q31 below). Indices 0-15 pick the shifted rows of the previous
// matrix; indices 16-31 splice in bytes from the newly loaded line.
        .align 4
L(v_tbl_neon_dotprod):
        .byte  1,  2,  3, 16,   5,  6,  7, 20,   9, 10, 11, 24,  13, 14, 15, 28
        .byte  1,  2,  3, 16,   5,  6,  7, 17,   9, 10, 11, 18,  13, 14, 15, 19
        .byte  1,  2,  3, 20,   5,  6,  7, 21,   9, 10, 11, 22,  13, 14, 15, 23
        .byte  1,  2,  3, 24,   5,  6,  7, 25,   9, 10, 11, 26,  13, 14, 15, 27
        .byte  1,  2,  3, 28,   5,  6,  7, 29,   9, 10, 11, 30,  13, 14, 15, 31
76
77
// Emit one exported 8bpc entry point for a (horizontal, vertical) filter-type
// combination. The entry point loads the packed filter-type constants into
// x9 (horizontal) and x10 (vertical) — the registers the shared
// \op\()_8tap_\isa implementation reads — then tail-branches to it.
// The variant emitted with jump=0 omits the branch and falls straight
// through into the shared implementation placed immediately after it.
.macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
        mov             x9,  \type_h    // packed horizontal filter-type constant
        mov             x10, \type_v    // packed vertical filter-type constant
    .if \jump
        b               \op\()_8tap_\isa
    .endif
endfunc
.endm
87
88.macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
89make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
90make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
91make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
92make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
93make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
94make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
95make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
96make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
97make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0
98
99function \type\()_8tap_\isa, align=FUNC_ALIGN
100        clz             w8, \w
101        mov             w11,  #0x4081   // (1 << 14) | (1 << 7) | (1 << 0)
102        sub             w8, w8, #24     // for jump tables
103        movrel          x12, X(mc_subpel_filters)
104        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
105        cbnz            \my, L(\type\()_8tap_v_\isa)
106.ifc \type, prep
107        add             \wd_strd, \w, \w    // prep_neon needs w * 2 as stride
108.endif
109        b               X(\type\()_neon)
110
111        .align JUMP_ALIGN
112L(\type\()_8tap_v_\isa):
113        madd            \my, \my, w11, w10
114        ldr             q6, L(v_tbl_neon_dotprod)
115        sub             \src, \src, \s_strd
116.ifc \isa, neon_dotprod
117    .ifc \type, prep
118        mov             w8, 0x2002          // FILTER_WEIGHT * 128 + rounding
119        dup             v4.4s, w8
120    .else
121        movi            v4.4s, #32, lsl 8   // FILTER_WEIGHT * 128, bias for SDOT
122    .endif
123.endif
124        ubfx            w11, \my, #7, #7
125        and             \my, \my, #0x7F
126        ldr             q28, L(v_tbl_neon_dotprod) + 16
127        cmp             \h, #4
128        csel            \my, \my, w11, le
129        sub             \src, \src, \s_strd, lsl #1     // src - s_strd * 3
130        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
131        ldr             q29, L(v_tbl_neon_dotprod) + 32
132.ifc \isa, neon_dotprod
133        movi            v5.16b, #128
134.endif
135        ldr             d7, [\xmy]
136        cmp             \w, #8
137        b.eq            80f
138        b.lt            40f
139
140        // .align JUMP_ALIGN    // fallthrough
141160:    // V - 16xN+
142        ldr             q30, L(v_tbl_neon_dotprod) + 48
143        ldr             q31, L(v_tbl_neon_dotprod) + 64
144.ifc \type, prep
145        add             \wd_strd, \w, \w
146.endif
147        .align LOOP_ALIGN
148161:
149        mov             \lsrc, \src
150        mov             \ldst, \dst
151        sub             w8, \h, #1
152
153        ldr             q16, [\lsrc]
154        ldr             q17, [\lsrc, \s_strd]
155        add             \lsrc, \lsrc, \s_strd, lsl #1
156        ldr             q18, [\lsrc]
157        ldr             q19, [\lsrc, \s_strd]
158        add             \lsrc, \lsrc, \s_strd, lsl #1
159
160        zip1            v0.16b, v16.16b, v17.16b
161        zip2            v1.16b, v16.16b, v17.16b
162        zip1            v2.16b, v18.16b, v19.16b
163        zip2            v3.16b, v18.16b, v19.16b
164
165        ldr             q20, [\lsrc]
166        ldr             q21, [\lsrc, \s_strd]
167        add             \lsrc, \lsrc, \s_strd, lsl #1
168        ldr             q22, [\lsrc]
169        ldr             q23, [\lsrc, \s_strd]
170        add             \lsrc, \lsrc, \s_strd, lsl #1
171
172        zip1            v18.16b, v20.16b, v21.16b
173        zip2            v21.16b, v20.16b, v21.16b
174        zip1            v24.16b, v22.16b, v23.16b
175        zip2            v27.16b, v22.16b, v23.16b
176
177        zip1            v16.8h, v0.8h, v2.8h
178        zip2            v19.8h, v0.8h, v2.8h
179        zip1            v22.8h, v1.8h, v3.8h
180        zip2            v25.8h, v1.8h, v3.8h
181
182        zip1            v17.8h, v18.8h, v24.8h
183        zip2            v20.8h, v18.8h, v24.8h
184        zip1            v23.8h, v21.8h, v27.8h
185        zip2            v26.8h, v21.8h, v27.8h
186.ifc \isa, neon_dotprod
187        sub             v16.16b, v16.16b, v5.16b
188        sub             v19.16b, v19.16b, v5.16b
189        sub             v22.16b, v22.16b, v5.16b
190        sub             v25.16b, v25.16b, v5.16b
191
192        sub             v17.16b, v17.16b, v5.16b
193        sub             v20.16b, v20.16b, v5.16b
194        sub             v23.16b, v23.16b, v5.16b
195        sub             v26.16b, v26.16b, v5.16b
196.endif
197        .align LOOP_ALIGN
19816:
199.ifc \isa, neon_i8mm
200        ld1             {v18.16b}, [\lsrc], \s_strd
201        movi            v0.4s, #0
202        movi            v1.4s, #0
203        movi            v2.4s, #0
204        movi            v3.4s, #0
205        mov             v21.16b, v18.16b
206        mov             v24.16b, v18.16b
207        mov             v27.16b, v18.16b
208.else   // neon_dotprod
209        ld1             {v27.16b}, [\lsrc], \s_strd
210        mov             v0.16b, v4.16b
211        mov             v1.16b, v4.16b
212        mov             v2.16b, v4.16b
213        mov             v3.16b, v4.16b
214        sub             v18.16b, v27.16b, v5.16b
215        sub             v21.16b, v27.16b, v5.16b
216        sub             v24.16b, v27.16b, v5.16b
217        sub             v27.16b, v27.16b, v5.16b
218.endif
219        \dot            v0.4s, v16.16b, v7.4b[0]
220        \dot            v1.4s, v19.16b, v7.4b[0]
221        \dot            v2.4s, v22.16b, v7.4b[0]
222        \dot            v3.4s, v25.16b, v7.4b[0]
223
224        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
225        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
226        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
227        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b
228
229        \dot            v0.4s, v17.16b, v7.4b[1]
230        \dot            v1.4s, v20.16b, v7.4b[1]
231        \dot            v2.4s, v23.16b, v7.4b[1]
232        \dot            v3.4s, v26.16b, v7.4b[1]
233
234        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
235        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
236        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
237        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b
238
239        subs            w8, w8, #1
240        uzp1            v0.8h, v0.8h, v1.8h
241        uzp1            v2.8h, v2.8h, v3.8h
242.ifc \type, prep
243    .ifc \isa, neon_i8mm
244        srshr           v0.8h, v0.8h, #2
245        srshr           v1.8h, v2.8h, #2
246    .else
247        sshr            v0.8h, v0.8h, #2
248        sshr            v1.8h, v2.8h, #2
249    .endif
250        st1             {v0.8h, v1.8h}, [\ldst], \d_strd
251.else   // put
252        sqrshrun        v0.8b, v0.8h, #6
253        sqrshrun2       v0.16b, v2.8h, #6
254        st1             {v0.16b}, [\ldst], \d_strd
255.endif
256        b.gt            16b
257
258.ifc \isa, neon_i8mm
259        movi            v0.4s, #0
260        movi            v1.4s, #0
261        movi            v2.4s, #0
262        movi            v3.4s, #0
263.else   // neon_dotprod
264        mov             v0.16b, v4.16b
265        mov             v1.16b, v4.16b
266        mov             v2.16b, v4.16b
267        mov             v3.16b, v4.16b
268.endif
269        \dot            v0.4s, v16.16b, v7.4b[0]
270        \dot            v1.4s, v19.16b, v7.4b[0]
271        \dot            v2.4s, v22.16b, v7.4b[0]
272        \dot            v3.4s, v25.16b, v7.4b[0]
273
274        \dot            v0.4s, v17.16b, v7.4b[1]
275        \dot            v1.4s, v20.16b, v7.4b[1]
276        \dot            v2.4s, v23.16b, v7.4b[1]
277        \dot            v3.4s, v26.16b, v7.4b[1]
278
279        subs            \w, \w, #16
280        uzp1            v0.8h, v0.8h, v1.8h
281        uzp1            v2.8h, v2.8h, v3.8h
282.ifc \type, prep
283    .ifc \isa, neon_i8mm
284        srshr           v0.8h, v0.8h, #2
285        srshr           v1.8h, v2.8h, #2
286    .else
287        sshr            v0.8h, v0.8h, #2
288        sshr            v1.8h, v2.8h, #2
289    .endif
290        stp             q0, q1, [\ldst]
291        add             \dst, \dst, #32
292.else   // put
293        sqrshrun        v0.8b, v0.8h, #6
294        sqrshrun2       v0.16b, v2.8h, #6
295        str             q0, [\ldst]
296        add             \dst, \dst, #16
297.endif
298        add             \src, \src, #16
299        b.gt            161b
300        ret
301
302        .align JUMP_ALIGN
30380:     // V - 8xN
304        ldr             d16, [\src]
305        ldr             d17, [\src, \s_strd]
306        add             \src, \src, \s_strd, lsl #1
307        ldr             d18, [\src]
308        ldr             d19, [\src, \s_strd]
309        add             \src, \src, \s_strd, lsl #1
310
311        ldr             d20, [\src]
312        ldr             d21, [\src, \s_strd]
313        add             \src, \src, \s_strd, lsl #1
314        ldr             d22, [\src]
315        ldr             d23, [\src, \s_strd]
316        add             \src, \src, \s_strd, lsl #1
317        subs            \h, \h, #2  // for prep: sub is enough
318
319        zip1            v0.16b, v16.16b, v17.16b
320        zip1            v2.16b, v18.16b, v19.16b
321        zip1            v18.16b, v20.16b, v21.16b
322        zip1            v24.16b, v22.16b, v23.16b
323
324        zip1            v16.8h,  v0.8h,  v2.8h
325        zip2            v19.8h,  v0.8h,  v2.8h
326        zip1            v17.8h, v18.8h, v24.8h
327        zip2            v20.8h, v18.8h, v24.8h
328.ifc \isa, neon_dotprod
329        sub             v16.16b, v16.16b, v5.16b
330        sub             v19.16b, v19.16b, v5.16b
331        sub             v17.16b, v17.16b, v5.16b
332        sub             v20.16b, v20.16b, v5.16b
333.endif
334.ifc \type, put
335        b.eq            82f
336.endif
337        .align LOOP_ALIGN
3388:
339.ifc \isa, neon_i8mm
340        ldr             d18, [\src]
341        movi            v0.4s, #0
342        movi            v1.4s, #0
343        ldr             d24, [\src, \s_strd]
344        add             \src, \src, \s_strd, lsl #1
345        movi            v2.4s, #0
346        movi            v3.4s, #0
347        mov             v21.8b, v18.8b
348        mov             v27.8b, v24.8b
349.else   // neon_dotprod
350        ldr             d21, [\src]
351        ldr             d27, [\src, \s_strd]
352        add             \src, \src, \s_strd, lsl #1
353        mov             v0.16b, v4.16b
354        mov             v1.16b, v4.16b
355        mov             v2.16b, v4.16b
356        mov             v3.16b, v4.16b
357        sub             v18.16b, v21.16b, v5.16b
358        sub             v21.16b, v21.16b, v5.16b
359        sub             v24.16b, v27.16b, v5.16b
360        sub             v27.16b, v27.16b, v5.16b
361.endif
362        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
363        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
364        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
365        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
366
367        \dot            v0.4s, v16.16b, v7.4b[0]
368        \dot            v0.4s, v17.16b, v7.4b[1]
369        \dot            v1.4s, v19.16b, v7.4b[0]
370        \dot            v1.4s, v20.16b, v7.4b[1]
371
372        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
373        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
374        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
375        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b
376
377        \dot            v2.4s, v22.16b, v7.4b[0]
378        \dot            v2.4s, v23.16b, v7.4b[1]
379        \dot            v3.4s, v25.16b, v7.4b[0]
380        \dot            v3.4s, v26.16b, v7.4b[1]
381
382        subs            \h, \h, #2
383        uzp1            v0.8h, v0.8h, v1.8h
384        uzp1            v2.8h, v2.8h, v3.8h
385.ifc \type, prep
386    .ifc \isa, neon_i8mm
387        srshr           v0.8h, v0.8h, #2
388        srshr           v1.8h, v2.8h, #2
389    .else
390        sshr            v0.8h, v0.8h, #2
391        sshr            v1.8h, v2.8h, #2
392    .endif
393        stp             q0, q1, [\dst], #32
394.else   // put
395        sqrshrun        v0.8b, v0.8h, #6
396        sqrshrun        v1.8b, v2.8h, #6
397        str             d0, [\dst]
398        str             d1, [\dst, \d_strd]
399        add             \dst, \dst, \d_strd, lsl #1
400.endif
401        b.gt            8b
402
403.ifc \type, put
404        .align JUMP_ALIGN
40582:
406.endif
407.ifc \isa, neon_i8mm
408        ldr             d18, [\src]
409        movi            v0.4s, #0
410        movi            v1.4s, #0
411        movi            v2.4s, #0
412        movi            v3.4s, #0
413        mov             v21.8b, v18.8b
414.else   // neon_dotprod
415        ldr             d21, [\src]
416        mov             v0.16b, v4.16b
417        mov             v1.16b, v4.16b
418        mov             v2.16b, v4.16b
419        mov             v3.16b, v4.16b
420        sub             v18.16b, v21.16b, v5.16b
421        sub             v21.16b, v21.16b, v5.16b
422.endif
423        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
424        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
425        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
426        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
427
428        \dot            v0.4s, v16.16b, v7.4b[0]
429        \dot            v0.4s, v17.16b, v7.4b[1]
430        \dot            v1.4s, v19.16b, v7.4b[0]
431        \dot            v1.4s, v20.16b, v7.4b[1]
432
433        \dot            v2.4s, v22.16b, v7.4b[0]
434        \dot            v2.4s, v23.16b, v7.4b[1]
435        \dot            v3.4s, v25.16b, v7.4b[0]
436        \dot            v3.4s, v26.16b, v7.4b[1]
437
438        uzp1            v0.8h, v0.8h, v1.8h
439        uzp1            v2.8h, v2.8h, v3.8h
440.ifc \type, prep
441    .ifc \isa, neon_i8mm
442        srshr           v0.8h, v0.8h, #2
443        srshr           v1.8h, v2.8h, #2
444    .else
445        sshr            v0.8h, v0.8h, #2
446        sshr            v1.8h, v2.8h, #2
447    .endif
448        stp             q0, q1, [\dst]
449.else   // put
450        sqrshrun        v0.8b, v0.8h, #6
451        sqrshrun        v1.8b, v2.8h, #6
452        str             d0, [\dst]
453        str             d1, [\dst, \d_strd]
454.endif
455        ret
456
457        .align JUMP_ALIGN
45840:     // V - 4xN or 2xN (put only)
459.ifc \type, put
460        cmp             \w, #2
461        b.eq            20f
462.endif
463        ldr             s16, [\src]
464        ldr             s17, [\src, \s_strd]
465        add             \src, \src, \s_strd, lsl #1
466        ldr             s18, [\src]
467        ldr             s19, [\src, \s_strd]
468        add             \src, \src, \s_strd, lsl #1
469
470        ldr             s20, [\src]
471        ldr             s21, [\src, \s_strd]
472        add             \src, \src, \s_strd, lsl #1
473        ldr             s22, [\src]
474        ldr             s23, [\src, \s_strd]
475        add             \src, \src, \s_strd, lsl #1
476        subs            \h, \h, #2  // for prep: sub is enough
477
478        zip1            v0.8b, v16.8b, v17.8b
479        zip1            v2.8b, v18.8b, v19.8b
480        zip1            v18.8b, v20.8b, v21.8b
481        zip1            v24.8b, v22.8b, v23.8b
482
483        zip1            v16.8h, v0.8h, v2.8h
484        zip1            v17.8h, v18.8h, v24.8h
485.ifc \isa, neon_dotprod
486        sub             v16.16b, v16.16b, v5.16b
487        sub             v17.16b, v17.16b, v5.16b
488.endif
489.ifc \type, put
490        b.eq            42f
491.endif
492        .align LOOP_ALIGN
4934:
494        ldr             s18, [\src]
495        ldr             s21, [\src, \s_strd]
496        add             \src, \src, \s_strd, lsl #1
497.ifc \isa, neon_i8mm
498        movi            v0.4s, #0
499        movi            v1.4s, #0
500.else   // neon_dotprod
501        mov             v0.16b, v4.16b
502        mov             v1.16b, v4.16b
503        sub             v18.16b, v18.16b, v5.16b
504        sub             v21.16b, v21.16b, v5.16b
505.endif
506        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
507        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
508
509        \dot            v0.4s, v16.16b, v7.4b[0]
510        \dot            v0.4s, v17.16b, v7.4b[1]
511
512        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
513        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b
514
515        \dot            v1.4s, v19.16b, v7.4b[0]
516        \dot            v1.4s, v20.16b, v7.4b[1]
517.ifc \type, prep
518        subs            \h, \h, #2
519    .ifc \isa, neon_i8mm
520        rshrn           v0.4h, v0.4s, #2
521        rshrn2          v0.8h, v1.4s, #2
522    .else
523        shrn            v0.4h, v0.4s, #2
524        shrn2           v0.8h, v1.4s, #2
525    .endif
526        str             q0, [\dst], #16
527.else
528        uzp1            v0.8h, v0.8h, v1.8h
529        sqrshrun        v0.8b, v0.8h, #6
530        subs            \h, \h, #2
531        fmov            x8, d0
532        lsr             x9, x8, #32
533        str             w8, [\dst]
534        str             w9, [\dst, \d_strd]
535        add             \dst, \dst, \d_strd, lsl #1
536.endif
537        b.gt            4b
538
539.ifc \type, put
540        .align JUMP_ALIGN
54142:
542.endif
543        ldr             s18, [\src]
544.ifc \isa, neon_i8mm
545        movi            v0.4s, #0
546        movi            v1.4s, #0
547.else   // neon_dotprod
548        mov             v0.16b, v4.16b
549        mov             v1.16b, v4.16b
550        sub             v18.16b, v18.16b, v5.16b
551.endif
552        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
553        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
554
555        \dot            v0.4s, v16.16b, v7.4b[0]
556        \dot            v0.4s, v17.16b, v7.4b[1]
557
558        \dot            v1.4s, v19.16b, v7.4b[0]
559        \dot            v1.4s, v20.16b, v7.4b[1]
560.ifc \type, prep
561    .ifc \isa, neon_i8mm
562        rshrn           v0.4h, v0.4s, #2
563        rshrn2          v0.8h, v1.4s, #2
564    .else
565        shrn            v0.4h, v0.4s, #2
566        shrn2           v0.8h, v1.4s, #2
567    .endif
568        str             q0, [\dst]
569.else
570        uzp1            v0.8h, v0.8h, v1.8h
571        sqrshrun        v0.8b, v0.8h, #6
572        fmov            x8, d0
573        lsr             x9, x8, #32
574        str             w8, [\dst]
575        str             w9, [\dst, \d_strd]
576.endif
577        ret
578
579.ifc \type, put
580        .align JUMP_ALIGN
58120:     // V - 2xN
582        ldr             h16, [\src]
583        ldr             h17, [\src, \s_strd]
584        add             \src, \src, \s_strd, lsl #1
585        ldr             h18, [\src]
586        ldr             h19, [\src, \s_strd]
587        add             \src, \src, \s_strd, lsl #1
588
589        ldr             h20, [\src]
590        ldr             h21, [\src, \s_strd]
591        add             \src, \src, \s_strd, lsl #1
592        ldr             h22, [\src]
593        ldr             h23, [\src, \s_strd]
594        add             \src, \src, \s_strd, lsl #1
595        subs            \h, \h, #2
596
597        zip1            v0.8b, v16.8b, v17.8b
598        zip1            v2.8b, v18.8b, v19.8b
599        zip1            v18.8b, v20.8b, v21.8b
600        zip1            v24.8b, v22.8b, v23.8b
601
602        zip1            v16.4h, v0.4h, v2.4h
603        zip1            v17.4h, v18.4h, v24.4h
604    .ifc \isa, neon_dotprod
605        sub             v16.8b, v16.8b, v5.8b
606        sub             v17.8b, v17.8b, v5.8b
607    .endif
608        b.eq            22f
609
610        .align LOOP_ALIGN
6112:
612        ldr             h18, [\src]
613        ldr             h21, [\src, \s_strd]
614        add             \src, \src, \s_strd, lsl #1
615    .ifc \isa, neon_i8mm
616        movi            v0.4s, #0
617        movi            v1.4s, #0
618    .else   // put
619        mov             v0.16b, v4.16b
620        mov             v1.16b, v4.16b
621        sub             v18.8b, v18.8b, v5.8b
622        sub             v21.8b, v21.8b, v5.8b
623    .endif
624        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
625        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
626
627        \dot            v0.4s, v16.16b, v7.4b[0]
628        \dot            v0.4s, v17.16b, v7.4b[1]
629
630        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
631        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b
632
633        \dot            v1.4s, v19.16b, v7.4b[0]
634        \dot            v1.4s, v20.16b, v7.4b[1]
635
636        uzp1            v0.8h, v0.8h, v1.8h
637        sqrshrun        v0.8b, v0.8h, #6
638
639        subs            \h, \h, #2
640        fmov            x8, d0
641        lsr             x9, x8, #32
642        strh            w8, [\dst]
643        strh            w9, [\dst, \d_strd]
644        add             \dst, \dst, \d_strd, lsl #1
645        b.gt            2b
646
647        .align JUMP_ALIGN
64822:
649        ldr             h18, [\src]
650    .ifc \isa, neon_i8mm
651        movi            v0.4s, #0
652        movi            v1.4s, #0
653    .else   // put
654        mov             v0.16b, v4.16b
655        mov             v1.16b, v4.16b
656        sub             v18.8b, v18.8b, v5.8b
657    .endif
658        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
659        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
660
661        \dot            v0.4s, v16.16b, v7.4b[0]
662        \dot            v0.4s, v17.16b, v7.4b[1]
663
664        \dot            v1.4s, v19.16b, v7.4b[0]
665        \dot            v1.4s, v20.16b, v7.4b[1]
666
667        uzp1            v0.8h, v0.8h, v1.8h
668        sqrshrun        v0.8b, v0.8h, #6
669
670        fmov            x8, d0
671        lsr             x9, x8, #32
672        strh            w8, [\dst]
673        strh            w9, [\dst, \d_strd]
674        ret
675.endif
676
677        .align JUMP_ALIGN
678L(\type\()_8tap_h_hv_\isa):
679        madd            \mx, \mx, w11, w9
680        madd            w14, \my, w11, w10      // for HV
681        ldr             q28, L(h_tbl_neon_dotprod)
682.ifc \isa, neon_dotprod
683        mov             w13, 0x2002             // FILTER_WEIGHT * 128 + rounding
684        dup             v27.4s, w13             // put H overrides this
685.endif
686        sub             \src, \src, #4          // src - 4
687        ubfx            w9, \mx, #7, #7
688        and             \mx, \mx, #0x7F
689        ubfx            w11, w14, #7, #7        // for HV
690        and             w14, w14, #0x7F         // for HV
691        cmp             \w, #4
692        csel            \mx, \mx, w9, le
693        add             \xmx, x12, \xmx, lsl #3 // subpel H filter address
694.ifc \isa, neon_dotprod
695        movi            v24.16b, #128
696.endif
697        cbz             \my, L(\type\()_8tap_h_\isa)
698
699        // HV cases
700        cmp             \h, #4
701        csel            w14, w14, w11, le
702        sub             \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 4
703        add             \xmy, x12, x14, lsl #3      // subpel V filter address
704        mov             x15, x30
705        ldr             d7, [\xmy]
706.ifc \type, put
707        ldr             q25, L(hv_tbl_neon_dotprod)
708.endif
709        sxtl            v7.8h, v7.8b
710        cmp             w10, SHARP1
711        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1
712
713        // HV 8-tap cases
714        sub             \src, \src, \s_strd         // src - s_strd * 3 - 4
715        cmp             \w, #4
716        b.eq            40f
717.ifc \type, put
718        b.lt            20f
719.endif
720
721        // .align JUMP_ALIGN    // fallthrough
72280:     // HV8 - 8xN+
723        ldr             q29, L(h_tbl_neon_dotprod) + 16
724        ldr             q30, L(h_tbl_neon_dotprod) + 32
725        ldr             d26, [\xmx]
726.ifc \type, prep
727        add             \wd_strd, \w, \w
728.endif
729        .align LOOP_ALIGN
73081:
731        mov             \lsrc, \src
732        mov             \ldst, \dst
733        mov             w8, \h
734.ifc \isa, neon_i8mm
735        bl              L(\type\()_hv_filter8_\isa)
736        srshr           v16.8h, v22.8h, #2
737        bl              L(\type\()_hv_filter8_\isa)
738        srshr           v17.8h, v22.8h, #2
739        bl              L(\type\()_hv_filter8_\isa)
740        srshr           v18.8h, v22.8h, #2
741        bl              L(\type\()_hv_filter8_\isa)
742        srshr           v19.8h, v22.8h, #2
743        bl              L(\type\()_hv_filter8_\isa)
744        srshr           v20.8h, v22.8h, #2
745        bl              L(\type\()_hv_filter8_\isa)
746        srshr           v21.8h, v22.8h, #2
747        bl              L(\type\()_hv_filter8_\isa)
748        srshr           v22.8h, v22.8h, #2
749.else
750        bl              L(\type\()_hv_filter8_\isa)
751        sshr            v16.8h, v22.8h, #2
752        bl              L(\type\()_hv_filter8_\isa)
753        sshr            v17.8h, v22.8h, #2
754        bl              L(\type\()_hv_filter8_\isa)
755        sshr            v18.8h, v22.8h, #2
756        bl              L(\type\()_hv_filter8_\isa)
757        sshr            v19.8h, v22.8h, #2
758        bl              L(\type\()_hv_filter8_\isa)
759        sshr            v20.8h, v22.8h, #2
760        bl              L(\type\()_hv_filter8_\isa)
761        sshr            v21.8h, v22.8h, #2
762        bl              L(\type\()_hv_filter8_\isa)
763        sshr            v22.8h, v22.8h, #2
764.endif
765        .align LOOP_ALIGN
7668:
767        ldr             q23, [\lsrc]
768        add             \lsrc, \lsrc, \s_strd
769
770        smull           v0.4s, v16.4h, v7.h[0]
771        smull2          v1.4s, v16.8h, v7.h[0]
772        mov             v16.16b, v17.16b
773.ifc \isa, neon_i8mm
774        movi            v5.4s, #0
775        movi            v6.4s, #0
776        tbl             v2.16b, {v23.16b}, v28.16b
777        tbl             v3.16b, {v23.16b}, v29.16b
778.else   // neon_dotprod
779        sub             v23.16b, v23.16b, v24.16b
780        mov             v5.16b, v27.16b
781        mov             v6.16b, v27.16b
782.endif
783        smlal           v0.4s, v17.4h, v7.h[1]
784        smlal2          v1.4s, v17.8h, v7.h[1]
785.ifc \isa, neon_i8mm
786        tbl             v4.16b, {v23.16b}, v30.16b
787        mov             v17.16b, v18.16b
788.else   // neon_dotprod
789        mov             v17.16b, v18.16b
790        tbl             v2.16b, {v23.16b}, v28.16b
791        tbl             v3.16b, {v23.16b}, v29.16b
792        tbl             v4.16b, {v23.16b}, v30.16b
793.endif
794        smlal           v0.4s, v18.4h, v7.h[2]
795        smlal2          v1.4s, v18.8h, v7.h[2]
796        mov             v18.16b, v19.16b
797
798        \dot            v5.4s, v2.16b, v26.4b[0]
799        \dot            v6.4s, v3.16b, v26.4b[0]
800
801        smlal           v0.4s, v19.4h, v7.h[3]
802        smlal2          v1.4s, v19.8h, v7.h[3]
803        mov             v19.16b, v20.16b
804
805        \dot            v5.4s, v3.16b, v26.4b[1]
806        \dot            v6.4s, v4.16b, v26.4b[1]
807
808        smlal           v0.4s, v20.4h, v7.h[4]
809        smlal2          v1.4s, v20.8h, v7.h[4]
810        mov             v20.16b, v21.16b
811
812        smlal           v0.4s, v21.4h, v7.h[5]
813        smlal2          v1.4s, v21.8h, v7.h[5]
814.ifc \type, prep
815        uzp1            v23.8h, v5.8h, v6.8h
816.endif
817        mov             v21.16b, v22.16b
818        smlal           v0.4s, v22.4h, v7.h[6]
819        smlal2          v1.4s, v22.8h, v7.h[6]
820.ifc \isa, neon_i8mm
821        subs            w8, w8, #1
822.endif
823.ifc \type, prep
824    .ifc \isa, neon_i8mm
825        srshr           v22.8h, v23.8h, #2
826    .else
827        sshr            v22.8h, v23.8h, #2
828    .endif
829        smlal           v0.4s, v22.4h, v7.h[7]
830        smlal2          v1.4s, v22.8h, v7.h[7]
831        rshrn           v0.4h, v0.4s, #6
832        rshrn2          v0.8h, v1.4s, #6
833.else   // put
834    .ifc \isa, neon_i8mm
835        rshrn           v22.4h, v5.4s, #2
836        rshrn2          v22.8h, v6.4s, #2
837    .else
838        shrn            v22.4h, v5.4s, #2
839        shrn2           v22.8h, v6.4s, #2
840    .endif
841        smlal           v0.4s, v22.4h, v7.h[7]
842        smlal2          v1.4s, v22.8h, v7.h[7]
843        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b
844        sqrshrun        v0.8b, v0.8h, #2
845.endif
846.ifc \isa, neon_dotprod
847        subs            w8, w8, #1
848.endif
849.ifc \type, prep
850        st1             {v0.8h}, [\ldst], \d_strd
851        b.gt            8b
852        add             \dst, \dst, #16
853.else
854        st1             {v0.8b}, [\ldst], \d_strd
855        b.gt            8b
856        add             \dst, \dst, #8
857.endif
858        add             \src, \src, #8
859        subs            \w, \w, #8
860        b.gt            81b
861        ret             x15
862
        .align JUMP_ALIGN
40:     // HV8 - 4xN
        // 4-wide horizontal+vertical 8-tap path. Only the 4 middle
        // horizontal taps (at \xmx + 2) are used; \src is advanced by 2
        // so the tbl windows line up with those taps.
        ldr             s26, [\xmx, #2]
        add             \src, \src, #2

        // Prime the 7-row vertical pipeline: each helper call filters one
        // source row horizontally (4 sums in v22.4s); narrow to 16 bit
        // with an intermediate >>2 into v16..v22.
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd        // newest source row

        // Vertical 8-tap accumulation: rows v16..v22 plus the row filtered
        // below, weighted by v7.h[0..7]; the movs rotate the pipeline.
        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b         // re-bias pixels for signed sdot
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b       // gather the 4-tap windows
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b                 // preloaded sdot bias/rounding
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]        // h-filter the newest row
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]
.ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
.else
        shrn            v22.4h, v5.4s, #2
.endif
        smlal           v0.4s, v22.4h, v7.h[7]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6                // round to prep precision
        str             d0, [\dst], #8
        subs            \h, \h, #1
.else
        subs            \h, \h, #1
        tbl             v0.8b, {v0.16b}, v25.8b         // pick 16 bits of each 32-bit sum (v25 set by caller)
        sqrshrun        v0.8b, v0.8h, #2                // final round + clamp to u8
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15
931
.ifc \type, put
        .align JUMP_ALIGN
20:     // HV8 - 2xN
        // 2-wide variant of the 4xN HV8 path above (put only); identical
        // filtering, but only 2 output pixels are stored per row (str h0).
        ldr             s26, [\xmx, #2]                 // 4 middle horizontal taps
        add             \src, \src, #2

        // Prime the 7-row vertical pipeline (v16..v22), >>2 intermediate.
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd        // newest source row

        // Vertical 8-tap with pipeline rotation, as in the 4xN loop.
        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b         // re-bias pixels for signed sdot
    .endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b                 // preloaded sdot bias/rounding
    .endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]        // h-filter the newest row
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b

        smlal           v0.4s, v22.4h, v7.h[6]
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        subs            \h, \h, #1

        tbl             v0.8b, {v0.16b}, v25.8b         // pick 16 bits of each 32-bit sum
        sqrshrun        v0.8b, v0.8h, #2                // final round + clamp to u8

        str             h0, [\dst]                      // store 2 pixels
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif
999
        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        // 6-tap horizontal + vertical path: dispatch on width, then the
        // 8xN+ loop below. Only vertical taps v7.h[1..6] are used (the
        // outer two taps of the 8-tap layout are skipped).
        cmp             \w, #4
        b.eq            40f
.ifc \type, put
        b.lt            20f
.endif

        // .align JUMP_ALIGN    // fallthrough
80:     // HV6 - 8xN+
        // v29/v30: tbl shuffle indices for the 2nd/3rd 4-byte windows
        // (literal pool defined elsewhere in the file); v26: 8 h taps.
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]
.ifc \type, prep
        add             \wd_strd, \w, \w                // prep dst stride = 2*w bytes
.endif

        .align LOOP_ALIGN
81:     // per-column-strip loop: process one 8-wide strip of height \h
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        // Prime the 5-row vertical pipeline (v16..v20), rounding >>2.
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
.else
        // dotprod variant: plain >>2 (rounding folded into the v27 bias).
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:
        ldr             q23, [\xmy]                     // \xmy aliases \lsrc (same register in both instantiations)
        add             \xmy, \xmy, \s_strd

        // Vertical 6-tap: taps h[1..6]; movs rotate the 5-row pipeline.
        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
.ifc \isa, neon_dotprod
        sub             v23.16b, v23.16b, v24.16b       // re-bias pixels for signed sdot
.endif
        mov             v16.16b, v17.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
.else
        mov             v5.16b, v27.16b                 // preloaded sdot bias/rounding
        mov             v6.16b, v27.16b
.endif
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b

        \dot            v5.4s, v2.16b, v26.4b[0]        // h-filter newest row, taps 0-3
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v3.16b, v26.4b[1]        // taps 4-7
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b
        uzp1            v23.8h, v5.8h, v6.8h            // pack 8 sums to 16 bit

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
.ifc \isa, neon_i8mm
        srshr           v20.8h, v23.8h, #2
.else
        sshr            v20.8h, v23.8h, #2
.endif
        subs            w8, w8, #1
        smlal           v0.4s, v20.4h, v7.h[6]          // newest row gets tap h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6                // round to prep precision
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
.else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b   // pick 16 bits of each 32-bit sum
        sqrshrun        v0.8b, v0.8h, #2                // final round + clamp to u8
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8                  // next 8-wide strip
        subs            \w, \w, #8
        b.gt            81b
        ret             x15
1113
        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        // Helper: horizontal 8-tap filter of one 16-pixel row.
        // In:  \lsrc (post-incremented by \s_strd), v26 = 8 taps,
        //      v28-v30 = tbl shuffle indices (set up by the caller).
        // Out: v22.8h = 8 packed dot-product sums (caller applies >>2).
        // Clobbers v2-v4, v23. Returns via lr from the bl.
        ld1             {v4.16b}, [\lsrc], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #0
        movi            v23.4s, #0
.else   // neon_dotprod
        sub             v4.16b, v4.16b, v24.16b         // re-bias pixels for signed sdot
        mov             v22.16b, v27.16b                // preloaded sdot bias/rounding
        mov             v23.16b, v27.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b       // three overlapping 4-byte windows
        tbl             v3.16b, {v4.16b}, v29.16b
        tbl             v4.16b, {v4.16b}, v30.16b
        \dot            v22.4s, v2.16b, v26.4b[0]       // taps 0-3
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v22.4s, v3.16b, v26.4b[1]       // taps 4-7
        \dot            v23.4s, v4.16b, v26.4b[1]
        uzp1            v22.8h, v22.8h, v23.8h          // keep low 16 bits of each sum
        ret
1134
        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        // Helper: horizontal 4-tap filter of one row, 4 sums in v22.4s.
        // v26.s[0] holds the 4 middle taps; callers narrow with shrn #2.
        ld1             {v4.8b}, [\src], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #2      // #2 pre-rounds the callers' non-rounding shrn #2
.else
        mov             v22.16b, v27.16b                // preloaded sdot bias/rounding
        sub             v4.16b, v4.16b, v24.16b         // re-bias pixels for signed sdot
.endif
        tbl             v2.16b, {v4.16b}, v28.16b       // gather the 4-tap windows
        \dot            v22.4s, v2.16b, v26.4b[0]
        ret
1147
        .align JUMP_ALIGN
40:     // HV6 - 4xN
        // 4-wide 6-tap HV path: 4 middle horizontal taps, 5-row vertical
        // pipeline (v16..v20), vertical taps v7.h[1..6].
        ldr             s26, [\xmx, #2]
        add             \src, \src, #2

        // Prime the 5-row pipeline; >>2 intermediate narrowing.
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
4:
        ld1             {v4.8b}, [\src], \s_strd        // newest source row

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b         // re-bias pixels for signed sdot
.endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b                 // preloaded sdot bias/rounding
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]        // h-filter the newest row

        smlal           v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
.else
        shrn            v20.4h, v5.4s, #2
.endif
        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]          // newest row gets tap h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6                // round to prep precision
        str             d0, [\dst], #8
.else
        tbl             v0.8b, {v0.16b}, v25.8b         // pick 16 bits of each 32-bit sum
        sqrshrun        v0.8b, v0.8h, #2                // final round + clamp to u8
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15
1207
.ifc \type, put
        .align JUMP_ALIGN
20:     // HV6 - 2xN
        // 2-wide variant of the 6-tap 4xN path above (put only);
        // identical filtering, stores 2 pixels per row (str h0).
        ldr             s26, [\xmx, #2]
        add             \src, \src, #2

        // Prime the 5-row pipeline (v16..v20).
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
2:
        ld1             {v4.8b}, [\src], \s_strd        // newest source row

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b         // re-bias pixels for signed sdot
    .endif
        mov             v16.16b, v17.16b
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b                 // preloaded sdot bias/rounding
    .endif

        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]        // h-filter the newest row

        smlal           v0.4s, v20.4h, v7.h[5]
    .ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2
    .else
        shrn            v20.4h, v5.4s, #2
    .endif

        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]          // newest row gets tap h[6]

        tbl             v0.8b, {v0.16b}, v25.8b         // pick 16 bits of each 32-bit sum
        sqrshrun        v0.8b, v0.8h, #2                // final round + clamp to u8

        str             h0, [\dst]                      // store 2 pixels
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif
1268
        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        // Horizontal-only 8-tap entry: dispatch on the width class in x8
        // (set by the caller) via the hword table at the end of the
        // function. Entries hold (table - target), so subtracting the
        // loaded offset from the table address yields the target.
        adr             x9, L(\type\()_8tap_h_\isa\()_tbl)
        ldrh            w8, [x9, x8, lsl #1]
.ifc \type, put
    .ifc \isa, neon_i8mm
        movi            v27.4s, #34     // special rounding
    .else
        mov             w10, #0x2022    // 64 * 128 + 34, bias and rounding for SDOT
        dup             v27.4s, w10
    .endif
.endif
        sub             x9, x9, x8
        br              x9
1283
.ifc \type, put
        .align JUMP_ALIGN
20:     // H - 2xN
        // 2-wide horizontal-only path (put only): processes two rows per
        // iteration with the 4 middle taps.
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2
        ldr             s26, [\xmx, #2]

        .align LOOP_ALIGN
2:
        ldr             d0, [\src]                      // row 0
        ldr             d1, [\src, \s_strd]             // row 1
        add             \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b            // re-bias pixels for signed sdot
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b                 // bias + rounding (set at dispatch)
        mov             v5.16b, v27.16b

        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]

        uzp1            v4.8h, v4.8h, v5.8h             // pack both rows' sums
        sqshrun         v4.8b, v4.8h, #6                // round + clamp to u8

        subs            \h, \h, #2
        fmov            x8, d4                          // bytes 0-3 row 0, 4-7 row 1
        lsr             x9, x8, #32
        strh            w8, [\dst]                      // 2 pixels, row 0
        strh            w9, [\dst, \d_strd]             // 2 pixels, row 1
        add             \dst, \dst, \d_strd, lsl #1
        b.gt            2b
        ret
.endif
1321
        .align JUMP_ALIGN
40:     // H - 4xN
        // 4-wide horizontal-only path: two rows per iteration with the
        // 4 middle taps.
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2
        ldr             s26, [\xmx, #2]

        .align LOOP_ALIGN
4:
        ldr             d0, [\src]                      // row 0
        ldr             d1, [\src, \s_strd]             // row 1
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0                       // usdot needs no bias
        movi            v5.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b            // re-bias pixels for signed sdot
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b                 // bias and/or rounding
        mov             v5.16b, v27.16b
.endif
        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]
.ifc \type, prep
        subs            \h, \h, #2
    .ifc \isa, neon_i8mm
        uzp1            v4.8h, v4.8h, v5.8h
        srshr           v4.8h, v4.8h, #2                // rounding intermediate shift
    .else
        shrn            v4.4h, v4.4s, #2                // rounding folded into v27
        shrn2           v4.8h, v5.4s, #2
    .endif
        str             q4, [\dst], #16                 // two rows of 4 x 16-bit
.else   // put
        uzp1            v4.8h, v4.8h, v5.8h
        sqshrun         v4.8b, v4.8h, #6                // round + clamp to u8
        subs            \h, \h, #2
        fmov            x8, d4
        lsr             x9, x8, #32
        str             w8, [\dst]                      // 4 pixels, row 0
        str             w9, [\dst, \d_strd]             // 4 pixels, row 1
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret
1371
        .align JUMP_ALIGN
80:     // H - 8xN
        // 8-wide horizontal-only path: full 8 taps, two rows per
        // iteration, three overlapping tbl windows per row.
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]
        .align LOOP_ALIGN
8:
        ldr             q0, [\src]                      // row 0
        ldr             q16, [\src, \s_strd]            // row 1
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0                       // usdot needs no bias
        movi            v5.4s, #0
        movi            v20.4s, #0
        movi            v21.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.16b, v0.16b, v24.16b         // re-bias pixels for signed sdot
        sub             v16.16b, v16.16b, v24.16b
    .endif
        mov             v4.16b, v27.16b                 // bias and/or rounding
        mov             v5.16b, v27.16b
        mov             v20.16b, v27.16b
        mov             v21.16b, v27.16b
.endif
        tbl             v1.16b, {v0.16b}, v28.16b
        tbl             v2.16b, {v0.16b}, v29.16b
        tbl             v3.16b, {v0.16b}, v30.16b
        tbl             v17.16b, {v16.16b}, v28.16b
        tbl             v18.16b, {v16.16b}, v29.16b
        tbl             v19.16b, {v16.16b}, v30.16b

        \dot            v4.4s, v1.16b, v26.4b[0]        // taps 0-3
        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v20.4s, v17.16b, v26.4b[0]
        \dot            v21.4s, v18.16b, v26.4b[0]
        \dot            v4.4s, v2.16b, v26.4b[1]        // taps 4-7
        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v20.4s, v18.16b, v26.4b[1]
        \dot            v21.4s, v19.16b, v26.4b[1]

        uzp1            v4.8h, v4.8h, v5.8h             // pack row 0
        uzp1            v20.8h, v20.8h, v21.8h          // pack row 1
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v4.8h, v4.8h, #2                // rounding intermediate shift
        srshr           v20.8h, v20.8h, #2
    .else
        sshr            v4.8h, v4.8h, #2                // rounding folded into v27
        sshr            v20.8h, v20.8h, #2
    .endif
        subs            \h, \h, #2
        stp             q4, q20, [\dst], #32
.else   // put
        sqshrun         v4.8b, v4.8h, #6                // round + clamp to u8
        sqshrun         v20.8b, v20.8h, #6
        subs            \h, \h, #2
        str             d4, [\dst]
        str             d20, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            8b
        ret
1437
        .align JUMP_ALIGN
160:    // H - 16xN
        // 16-wide horizontal-only path: one row per iteration, split into
        // two overlapping 16-byte loads so each tbl reads one register.
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]

        .align LOOP_ALIGN
16:
        ldr             q16, [\src]
        ldr             q17, [\src, #12]  // avoid 2 register TBL for small cores
        add             \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0                       // usdot needs no bias
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b       // re-bias pixels for signed sdot
        sub             v17.16b, v17.16b, v24.16b
    .endif
        mov             v6.16b, v27.16b                 // bias and/or rounding
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]        // taps 0-3
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]        // taps 4-7
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h             // pack pixels 0-7
        uzp1            v22.8h, v22.8h, v23.8h          // pack pixels 8-15
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2                // rounding intermediate shift
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2                // rounding folded into v27
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            \h, \h, #1
        stp             q6, q22, [\dst], #32
.else   // put
        sqshrun         v6.8b, v6.8h, #6                // round + clamp to u8
        sqshrun2        v6.16b, v22.8h, #6
        subs            \h, \h, #1
        st1             {v6.16b}, [\dst], \d_strd
.endif
        b.gt            16b
        ret
1500
        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        // 32/64/128-wide horizontal-only path: inner loop walks one row
        // in 16-pixel chunks (w8 = columns left), outer loop steps rows.
        // Strides are pre-reduced by w so they act as end-of-row skips.
        AARCH64_VALID_JUMP_TARGET
        ldr             q29, L(h_tbl_neon_dotprod) + 16
        ldr             q30, L(h_tbl_neon_dotprod) + 32
        ldr             d26, [\xmx]
.ifc \type, put
        sub             \d_strd, \d_strd, \w, uxtw
.endif
        sub             \s_strd, \s_strd, \w, uxtw
        mov             w8, \w

        .align LOOP_ALIGN
32:
        ldr             q16, [\src]
        ldr             q17, [\src, #12]  // avoid 2 register TBL for small cores
        add             \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0                       // usdot needs no bias
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b       // re-bias pixels for signed sdot
        sub             v17.16b, v17.16b, v24.16b
    .endif
        mov             v6.16b, v27.16b                 // bias and/or rounding
        mov             v7.16b, v27.16b
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]        // taps 0-3
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]        // taps 4-7
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h             // pack pixels 0-7
        uzp1            v22.8h, v22.8h, v23.8h          // pack pixels 8-15
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2                // rounding intermediate shift
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2                // rounding folded into v27
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            w8, w8, #16
        stp             q6, q22, [\dst], #32
.else   // put
        sqshrun         v6.8b, v6.8h, #6                // round + clamp to u8
        sqshrun2        v6.16b, v22.8h, #6
        subs            w8, w8, #16
        str             q6, [\dst], #16
.endif
        b.gt            32b

        // End of row: skip to the next row and reset the column counter.
        add             \src, \src, \s_strd
.ifc \type, put
        add             \dst, \dst, \d_strd
.endif
        mov             w8, \w
        subs            \h, \h, #1
        b.gt            32b
        ret
1578
L(\type\()_8tap_h_\isa\()_tbl):
        // Width-class jump table for L(\type\()_8tap_h_\isa): entries are
        // backward offsets (table - target), widest first. put additionally
        // gets a 2xN entry plus an unused padding hword.
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 1280b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 640b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 320b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 160b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 80b)
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 40b)
.ifc \type, put
        .hword (L(\type\()_8tap_h_\isa\()_tbl) - 20b)
        .hword 0        // unused padding entry
.endif
endfunc
.endm
1592
// Instantiate the 8-tap filters: prep/put with SDOT (always built under
// HAVE_DOTPROD), and prep/put with USDOT when i8mm is available. Note
// that ldst/lsrc deliberately alias xmx/xmy (same registers): the x-taps
// pointers are dead by the time the loop-local dst/src copies are needed.

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn  put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

#if HAVE_I8MM
ENABLE_I8MM

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn  put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

DISABLE_I8MM
#endif  // HAVE_I8MM

DISABLE_DOTPROD
#endif  // HAVE_DOTPROD
1617