• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
28
#include "src/arm/asm.S"
#include "util.S"
31
// Plain bidirectional average of one 16-pixel chunk.
// Loads 16 int16 intermediates from each of the two prediction buffers
// (x2 = tmp1, x3 = tmp2; both pointers advanced by 32 bytes), adds them,
// and narrows with a saturating rounding shift by 5 into 16 u8 pixels:
//   dst = sat_u8((tmp1 + tmp2 + 16) >> 5)
// \dst:       output vector (16 bytes of pixels)
// \t0-\t3:    scratch vector registers
.macro avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        add             \t0\().8h,   \t0\().8h,   \t2\().8h
        add             \t1\().8h,   \t1\().8h,   \t3\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #5
        sqrshrun2       \dst\().16b, \t1\().8h,   #5
.endm
40
// Weighted bidirectional average of one 16-pixel chunk.
// v30.8h holds -(weight << 11), set up once in bidir_fn from w6.
// sqdmulh computes (2*a*b) >> 16, so
//   diff = tmp2 - tmp1
//   t    = sqdmulh(diff, -(weight << 11))  ==  -(diff * weight) >> 4  (approx)
//   dst  = sat_u8((tmp2 + t + 8) >> 4)
// i.e. tmp1 is blended in with the given weight out of 16.
// \dst:       output vector (16 bytes of pixels)
// \t0-\t3:    scratch vector registers; x2/x3 advanced by 32 bytes each
.macro w_avg dst, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
53
// Per-pixel masked blend of one 16-pixel chunk.
// x6 points to 16 u8 mask values (advanced by 16); v31.16b holds the
// constant 254 (set up in bidir_fn).  The mask m (0..64) is scaled to
// m*254 and widened left by 8 into v28/v29, putting ~m/64 into Q15 form
// so that sqdmulh(diff, m_scaled) == (diff * m) >> 6 (approximately):
//   diff = tmp2 - tmp1
//   dst  = sat_u8((tmp2 - (diff * m >> 6) + 8) >> 4)
// \dst:       output vector (16 bytes of pixels)
// \t0-\t3:    scratch vector registers; x2/x3 advanced by 32 bytes each
// Clobbers v28-v30.
.macro mask dst, t0, t1, t2, t3
        ld1             {v30.16b}, [x6],  16
        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
        mul             v30.16b, v30.16b, v31.16b
        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
        shll            v28.8h, v30.8b,  #8
        shll2           v29.8h, v30.16b, #8
        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h
        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
        add             \t0\().8h,   \t2\().8h,   \t0\().8h
        add             \t1\().8h,   \t3\().8h,   \t1\().8h
        sqrshrun        \dst\().8b,  \t0\().8h,   #4
        sqrshrun2       \dst\().16b, \t1\().8h,   #4
.endm
70
// Instantiate one bidirectional compound function (avg / w_avg / mask).
// Presumed C signature (dav1d mc convention — confirm against mc.h):
//   void \type\()_8bpc_neon(pixel *dst, ptrdiff_t dst_stride,
//                           const int16_t *tmp1, const int16_t *tmp2,
//                           int w, int h, ...);
// Registers on entry, as used below:
//   x0 = dst, x1 = stride, x2 = tmp1, x3 = tmp2, w4 = width, w5 = height
//   w6 = weight (w_avg only), x6 = mask pointer (mask only)
// Dispatch: clz(width) - 24 indexes a table of backwards .hword offsets
// from L(\type\()_tbl); the first 16 pixels are computed into v4 before
// branching so every width entry starts with data ready to store.
.macro bidir_fn type
function \type\()_8bpc_neon, export=1
        clz             w4,  w4
.ifc \type, w_avg
        dup             v30.8h, w6              // v30 = -(weight << 11) for sqdmulh
        neg             v30.8h, v30.8h
        shl             v30.8h, v30.8h, #11
.endif
.ifc \type, mask
        movi            v31.16b, #256-2         // mask scale constant (254)
.endif
        adr             x7,  L(\type\()_tbl)
        sub             w4,  w4,  #24           // index = clz(w) - 24 (w=128 -> 0)
        ldrh            w4,  [x7, x4, lsl #1]
        \type           v4,  v0,  v1,  v2,  v3  // first 16 pixels, hides branch latency
        sub             x7,  x7,  w4, uxtw
        br              x7
40:     // width == 4: two rows per 8 bytes; h in {4, 8, 16} handled straight-line
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1            // interleave two dst row pointers
4:
        cmp             w5,  #4
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        b.eq            0f
        \type           v5,  v0,  v1,  v2,  v3
        cmp             w5,  #8
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        b.eq            0f
        \type           v4,  v0,  v1,  v2,  v3
        st1             {v4.s}[0],  [x0], x1
        st1             {v4.s}[1],  [x7], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.s}[2],  [x0], x1
        st1             {v4.s}[3],  [x7], x1
        st1             {v5.s}[0],  [x0], x1
        st1             {v5.s}[1],  [x7], x1
        st1             {v5.s}[2],  [x0], x1
        st1             {v5.s}[3],  [x7], x1
        ret
80:     // width == 8: four rows per iteration (two 16-byte chunks)
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.d}[0],  [x0], x1
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.d}[0],  [x0], x1
        subs            w5,  w5,  #4
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               8b
16:     // width == 16: four rows per iteration, one vector per row
        AARCH64_VALID_JUMP_TARGET
        \type           v5,  v0,  v1,  v2,  v3
        st1             {v4.16b}, [x0], x1
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        st1             {v6.16b}, [x0], x1
        subs            w5,  w5,  #4
        st1             {v7.16b}, [x0], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               16b
320:    // width == 32: two rows per iteration, two vectors per row
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
32:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b}, [x0], x1
        \type           v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v6.16b,v7.16b}, [x7], x1
        b.le            0f
        \type           v4,  v0,  v1,  v2,  v3
        b               32b
640:    // width == 64: two rows per iteration, four vectors per row
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
64:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #2
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               64b
1280:   // width == 128: one row per iteration, split into two 64-byte stores
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  #64
128:
        \type           v5,  v0,  v1,  v2,  v3
        \type           v6,  v0,  v1,  v2,  v3
        \type           v7,  v0,  v1,  v2,  v3
        \type           v16, v0,  v1,  v2,  v3
        \type           v17, v0,  v1,  v2,  v3
        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
        \type           v18, v0,  v1,  v2,  v3
        \type           v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
        b.le            0f
        \type           v4, v0,  v1,  v2,  v3
        b               128b
0:
        ret
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -  320b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm
204
// Instantiate avg_8bpc_neon, w_avg_8bpc_neon and mask_8bpc_neon.
bidir_fn avg
bidir_fn w_avg
bidir_fn mask
208
209
// Instantiate one w_mask function for mask layout 444 / 422 / 420.
// Blends two int16 intermediate predictions with a per-pixel weight
// derived from their absolute difference, stores the blended pixels to
// dst and writes the (possibly subsampled) mask to x6.
// Registers on entry, as used below (dav1d mc convention — confirm):
//   x0 = dst, x1 = stride, x2 = tmp1, x3 = tmp2,
//   w4 = width, w5 = height, x6 = mask out, w7 = sign
// Weight derivation per pixel (64 - m is stored for 444):
//   m = min(38 + (|tmp1 - tmp2| >> 8), 64), computed branch-free via
//   uqsub from the constant 6903 (= (27 << 8) | 247) then ushr #8,
//   and applied through sqdmulh after shl #9 (Q15 scaling).
.macro w_mask_fn type
function w_mask_\type\()_8bpc_neon, export=1
        clz             w8,  w4
        adr             x9,  L(w_mask_\type\()_tbl)
        sub             w8,  w8,  #24           // index = clz(w) - 24
        ldrh            w8,  [x9,  x8,  lsl #1]
        sub             x9,  x9,  w8,  uxtw
        mov             w10, #6903              // mask bias constant
        dup             v0.8h,   w10
.if \type == 444
        movi            v1.16b,  #64            // mask stored as 64 - m
.elseif \type == 422
        dup             v2.8b,   w7             // sign
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b // 129 - sign, for uhsub below
.elseif \type == 420
        dup             v2.8h,   w7             // sign
        movi            v3.8h,   #1, lsl #8
        sub             v3.8h,   v3.8h,   v2.8h // 256 - sign, for rshrn #2 below
.endif
        add             x12,  x0,  x1
        lsl             x1,   x1,  #1           // two interleaved dst row pointers
        br              x9
4:      // width == 4: four rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sub             v16.8h,  v6.8h,   v4.8h         // diff = tmp2 - tmp1
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h         // |diff|
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h        // 6903 -| |diff|  (saturating)
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8            // 64 - m
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9            // Q15 scale for sqdmulh
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h        // (diff * (64 - m)) >> 6, approx
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h         // + tmp1
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4            // narrow to pixels
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        uzp1            v18.16b,  v18.16b, v19.16b      // Same as xtn, xtn2
        sub             v18.16b,  v1.16b,  v18.16b      // m = 64 - (64 - m)
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,   v18.8h,  v19.8h       // horizontal 2:1 subsample
        xtn             v18.8b,   v18.8h
        uhsub           v18.8b,   v3.8b,   v18.8b       // (129 - sign - x) >> 1
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        trn1            v24.2d,   v18.2d,  v19.2d       // pair up vertically adjacent rows
        trn2            v25.2d,   v18.2d,  v19.2d
        add             v24.8h,   v24.8h,  v25.8h       // vertical sum
        addp            v18.8h,   v24.8h,  v24.8h       // horizontal sum
        sub             v18.4h,   v3.4h,   v18.4h       // 256 - sign - x
        rshrn           v18.8b,   v18.8h,  #2
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.s}[0],  [x0],  x1
        st1             {v22.s}[1],  [x12], x1
        st1             {v23.s}[0],  [x0],  x1
        st1             {v23.s}[1],  [x12], x1
        b.gt            4b
        ret
8:      // width == 8: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        subs            w5,  w5,  #2
        sub             v16.8h,  v6.8h,   v4.8h
        sub             v17.8h,  v7.8h,   v5.8h
        sabd            v18.8h,  v4.8h,   v6.8h
        sabd            v19.8h,  v5.8h,   v7.8h
        uqsub           v18.8h,  v0.8h,   v18.8h
        uqsub           v19.8h,  v0.8h,   v19.8h
        ushr            v18.8h,  v18.8h,  #8
        ushr            v19.8h,  v19.8h,  #8
        shl             v20.8h,  v18.8h,  #9
        shl             v21.8h,  v19.8h,  #9
        sqdmulh         v20.8h,  v20.8h,  v16.8h
        sqdmulh         v21.8h,  v21.8h,  v17.8h
        add             v20.8h,  v20.8h,  v4.8h
        add             v21.8h,  v21.8h,  v5.8h
        sqrshrun        v22.8b,  v20.8h,  #4
        sqrshrun        v23.8b,  v21.8h,  #4
.if \type == 444
        uzp1            v18.16b, v18.16b, v19.16b       // Same as xtn, xtn2
        sub             v18.16b, v1.16b,  v18.16b
        st1             {v18.16b}, [x6],  #16
.elseif \type == 422
        addp            v18.8h,  v18.8h,  v19.8h
        xtn             v18.8b,  v18.8h
        uhsub           v18.8b,  v3.8b,   v18.8b
        st1             {v18.8b},  [x6],  #8
.elseif \type == 420
        add             v18.8h,  v18.8h,  v19.8h        // vertical sum (rows are whole vectors)
        addp            v18.8h,  v18.8h,  v18.8h        // horizontal sum
        sub             v18.4h,  v3.4h,   v18.4h
        rshrn           v18.8b,  v18.8h,  #2
        st1             {v18.s}[0],  [x6],  #4
.endif
        st1             {v22.8b},  [x0],  x1
        st1             {v23.8b},  [x12], x1
        b.gt            8b
        ret
1280:   // widths 16..128 share one two-rows-at-a-time inner loop
640:
320:
160:
        AARCH64_VALID_JUMP_TARGET
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw     // stride minus the width advanced in-loop
.if \type == 444
        add             x10, x6,  w4,  uxtw     // second mask row pointer
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1  // tmp2, second row
        add             x7,  x2,  w4,  uxtw #1  // tmp1, second row
161:    // outer loop: two rows
        mov             w8,  w4
16:     // inner loop: 16 pixels of each of the two rows
        ld1             {v4.8h,   v5.8h},   [x2],  #32
        ld1             {v6.8h,   v7.8h},   [x3],  #32
        ld1             {v16.8h,  v17.8h},  [x7],  #32
        ld1             {v18.8h,  v19.8h},  [x9],  #32
        subs            w8,  w8,  #16
        sub             v6.8h,   v6.8h,   v4.8h         // diff, row 0
        sub             v7.8h,   v7.8h,   v5.8h
        sub             v18.8h,  v18.8h,  v16.8h        // diff, row 1
        sub             v19.8h,  v19.8h,  v17.8h
        abs             v20.8h,  v6.8h
        abs             v21.8h,  v7.8h
        abs             v22.8h,  v18.8h
        abs             v23.8h,  v19.8h
        uqsub           v20.8h,  v0.8h,   v20.8h
        uqsub           v21.8h,  v0.8h,   v21.8h
        uqsub           v22.8h,  v0.8h,   v22.8h
        uqsub           v23.8h,  v0.8h,   v23.8h
        ushr            v20.8h,  v20.8h,  #8            // 64 - m
        ushr            v21.8h,  v21.8h,  #8
        ushr            v22.8h,  v22.8h,  #8
        ushr            v23.8h,  v23.8h,  #8
        shl             v24.8h,  v20.8h,  #9
        shl             v25.8h,  v21.8h,  #9
        shl             v26.8h,  v22.8h,  #9
        shl             v27.8h,  v23.8h,  #9
        sqdmulh         v24.8h,  v24.8h,  v6.8h
        sqdmulh         v25.8h,  v25.8h,  v7.8h
        sqdmulh         v26.8h,  v26.8h,  v18.8h
        sqdmulh         v27.8h,  v27.8h,  v19.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v16.8h
        add             v27.8h,  v27.8h,  v17.8h
        sqrshrun        v24.8b,  v24.8h,  #4
        sqrshrun        v25.8b,  v25.8h,  #4
        sqrshrun        v26.8b,  v26.8h,  #4
        sqrshrun        v27.8b,  v27.8h,  #4
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b       // Same as xtn, xtn2
        uzp1            v21.16b, v22.16b, v23.16b       // Ditto
        sub             v20.16b, v1.16b,  v20.16b
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b},  [x6],  #8
        st1             {v21.8b},  [x10], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v22.8h        // vertical sum across both rows
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h        // horizontal 2:1
        sub             v20.8h,  v3.8h,   v20.8h
        rshrn           v20.8b,  v20.8h,  #2
        st1             {v20.8b},  [x6],  #8
.endif
        st1             {v24.8b,  v25.8b},  [x0],  #16
        st1             {v26.8b,  v27.8b},  [x12], #16
        b.gt            16b
        subs            w5,  w5,  #2
        add             x2,  x2,  w4,  uxtw #1  // skip the row already consumed as "row 1"
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm
425
// Instantiate w_mask_444/422/420_8bpc_neon.
w_mask_fn 444
w_mask_fn 422
w_mask_fn 420
429
430
// blend: blend a prediction block into dst with a per-pixel mask.
// Registers on entry, as used below (dav1d mc convention — confirm):
//   x0 = dst, x1 = stride, x2 = tmp, w3 = width, w4 = height, x5 = mask
// Per pixel: dst = (tmp * m + dst * (64 - m) + 32) >> 6, with m in 0..64.
// Dispatch by clz(width) - 26 into a backwards .hword offset table.
function blend_8bpc_neon, export=1
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26           // index: w=32 -> 0 ... w=4 -> 3
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw
        movi            v4.16b,  #64            // for 64 - m
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1            // two interleaved dst row pointers
        br              x6
4:      // width == 4: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8b},     [x5],  #8
        ld1             {v1.d}[0],   [x2],  #8
        ld1             {v0.s}[0],   [x0]
        subs            w4,  w4,  #2
        ld1             {v0.s}[1],   [x8]
        sub             v3.8b,   v4.8b,   v2.8b // 64 - m
        umull           v5.8h,   v1.8b,   v2.8b // tmp * m
        umlal           v5.8h,   v0.8b,   v3.8b //  + dst * (64 - m)
        rshrn           v6.8b,   v5.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:      // width == 8: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b},  [x5],  #16
        ld1             {v1.16b},  [x2],  #16
        ld1             {v0.d}[0],   [x0]
        ld1             {v0.d}[1],   [x8]
        sub             v3.16b,  v4.16b,  v2.16b
        subs            w4,  w4,  #2
        umull           v5.8h,   v1.8b,   v2.8b
        umlal           v5.8h,   v0.8b,   v3.8b
        umull2          v6.8h,   v1.16b,  v2.16b
        umlal2          v6.8h,   v0.16b,  v3.16b
        rshrn           v7.8b,   v5.8h,   #6
        rshrn2          v7.16b,  v6.8h,   #6
        st1             {v7.d}[0],   [x0],  x1
        st1             {v7.d}[1],   [x8],  x1
        b.gt            8b
        ret
16:     // width == 16: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b,  v2.16b},  [x5],  #32
        ld1             {v5.16b,  v6.16b},  [x2],  #32
        ld1             {v0.16b},  [x0]
        subs            w4,  w4,  #2
        sub             v7.16b,  v4.16b,  v1.16b
        sub             v20.16b, v4.16b,  v2.16b
        ld1             {v3.16b},  [x8]
        umull           v16.8h,  v5.8b,   v1.8b
        umlal           v16.8h,  v0.8b,   v7.8b
        umull2          v17.8h,  v5.16b,  v1.16b
        umlal2          v17.8h,  v0.16b,  v7.16b
        umull           v21.8h,  v6.8b,   v2.8b
        umlal           v21.8h,  v3.8b,   v20.8b
        umull2          v22.8h,  v6.16b,  v2.16b
        umlal2          v22.8h,  v3.16b,  v20.16b
        rshrn           v18.8b,  v16.8h,  #6
        rshrn2          v18.16b, v17.8h,  #6
        rshrn           v19.8b,  v21.8h,  #6
        rshrn2          v19.16b, v22.8h,  #6
        st1             {v18.16b}, [x0],  x1
        st1             {v19.16b}, [x8],  x1
        b.gt            16b
        ret
32:     // width == 32: two rows per iteration, two vectors per row
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
        ld1             {v20.16b, v21.16b}, [x0]
        subs            w4,  w4,  #2
        ld1             {v22.16b, v23.16b}, [x8]
        sub             v5.16b,  v4.16b,  v0.16b
        sub             v6.16b,  v4.16b,  v1.16b
        sub             v30.16b, v4.16b,  v2.16b
        sub             v31.16b, v4.16b,  v3.16b
        umull           v24.8h,  v16.8b,  v0.8b
        umlal           v24.8h,  v20.8b,  v5.8b
        umull2          v26.8h,  v16.16b, v0.16b
        umlal2          v26.8h,  v20.16b, v5.16b
        umull           v28.8h,  v17.8b,  v1.8b
        umlal           v28.8h,  v21.8b,  v6.8b
        umull2          v7.8h,   v17.16b, v1.16b
        umlal2          v7.8h,   v21.16b, v6.16b
        umull           v27.8h,  v18.8b,  v2.8b
        umlal           v27.8h,  v22.8b,  v30.8b
        umull2          v1.8h,   v18.16b, v2.16b
        umlal2          v1.8h,   v22.16b, v30.16b
        umull           v29.8h,  v19.8b,  v3.8b
        umlal           v29.8h,  v23.8b,  v31.8b
        umull2          v21.8h,  v19.16b, v3.16b
        umlal2          v21.8h,  v23.16b, v31.16b
        rshrn           v24.8b,  v24.8h,  #6
        rshrn2          v24.16b, v26.8h,  #6
        rshrn           v25.8b,  v28.8h,  #6
        rshrn2          v25.16b, v7.8h,   #6
        rshrn           v27.8b,  v27.8h,  #6
        rshrn2          v27.16b, v1.8h,   #6
        rshrn           v28.8b,  v29.8h,  #6
        rshrn2          v28.16b, v21.8h,  #6
        st1             {v24.16b, v25.16b}, [x0],  x1
        st1             {v27.16b, v28.16b}, [x8],  x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) - 32b
        .hword L(blend_tbl) - 16b
        .hword L(blend_tbl) -  8b
        .hword L(blend_tbl) -  4b
endfunc
544
// blend_h: horizontal-edge OBMC blend; one mask value per ROW, taken
// from the external obmc_masks table indexed by block height.
// Registers on entry, as used below (dav1d mc convention — confirm):
//   x0 = dst, x1 = stride, x2 = tmp, w3 = width, w4 = height
// Only the first h - h/4 (i.e. 3/4) rows are processed:
//   sub w4, w4, w4, lsr #2 below.
// Per pixel: dst = (tmp * m + dst * (64 - m) + 32) >> 6.
function blend_h_8bpc_neon, export=1
        adr             x6,  L(blend_h_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw     // mask row for this height
        sub             w4,  w4,  w4,  lsr #2   // rows to blend = h * 3/4
        clz             w7,  w3
        movi            v4.16b,  #64
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1            // two interleaved dst row pointers
        sub             w7,  w7,  #24           // index: w=128 -> 0 ... w=2 -> 6
        ldrh            w7,  [x6,  x7,  lsl #1]
        sub             x6,  x6,  w7, uxtw
        br              x6
2:      // width == 2: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.h}[0],   [x5],  #2  // two row masks
        ld1             {v1.s}[0],   [x2],  #4
        subs            w4,  w4,  #2
        ld1             {v2.h}[0],   [x0]
        zip1            v0.8b,   v0.8b,   v0.8b // duplicate mask across row pixels
        sub             v3.8b,   v4.8b,   v0.8b
        ld1             {v2.h}[1],   [x8]
        umull           v5.8h,   v1.8b,   v0.8b
        umlal           v5.8h,   v2.8b,   v3.8b
        rshrn           v5.8b,   v5.8h,   #6
        st1             {v5.h}[0],   [x0],  x1
        st1             {v5.h}[1],   [x8],  x1
        b.gt            2b
        ret
4:      // width == 4: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v0.8b,   v1.8b},   [x5],  #2   // broadcast the two row masks
        ld1             {v2.8b},   [x2],  #8
        subs            w4,  w4,  #2
        ext             v0.8b,   v0.8b,   v1.8b,   #4   // mask0 x4 | mask1 x4
        ld1             {v3.s}[0],   [x0]
        sub             v5.8b,   v4.8b,   v0.8b
        ld1             {v3.s}[1],   [x8]
        umull           v6.8h,   v2.8b,   v0.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        rshrn           v6.8b,   v6.8h,   #6
        st1             {v6.s}[0],   [x0],  x1
        st1             {v6.s}[1],   [x8],  x1
        b.gt            4b
        ret
8:      // width == 8: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b},  [x2],  #16
        ld1             {v3.d}[0],   [x0]
        ext             v0.16b,  v0.16b,  v1.16b,  #8   // mask0 x8 | mask1 x8
        sub             v5.16b,  v4.16b,  v0.16b
        ld1             {v3.d}[1],   [x8]
        subs            w4,  w4,  #2
        umull           v6.8h,   v0.8b,   v2.8b
        umlal           v6.8h,   v3.8b,   v5.8b
        umull2          v7.8h,   v0.16b,  v2.16b
        umlal2          v7.8h,   v3.16b,  v5.16b
        rshrn           v16.8b,  v6.8h,   #6
        rshrn2          v16.16b, v7.8h,   #6
        st1             {v16.d}[0],  [x0],  x1
        st1             {v16.d}[1],  [x8],  x1
        b.gt            8b
        ret
16:     // width == 16: two rows per iteration, one vector per row
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        ld1             {v2.16b,  v3.16b},  [x2],  #32
        ld1             {v5.16b},  [x0]
        sub             v7.16b,  v4.16b,  v0.16b
        sub             v16.16b, v4.16b,  v1.16b
        ld1             {v6.16b},  [x8]
        subs            w4,  w4,  #2
        umull           v17.8h,  v0.8b,   v2.8b
        umlal           v17.8h,  v5.8b,   v7.8b
        umull2          v18.8h,  v0.16b,  v2.16b
        umlal2          v18.8h,  v5.16b,  v7.16b
        umull           v19.8h,  v1.8b,   v3.8b
        umlal           v19.8h,  v6.8b,   v16.8b
        umull2          v20.8h,  v1.16b,  v3.16b
        umlal2          v20.8h,  v6.16b,  v16.16b
        rshrn           v21.8b,  v17.8h,  #6
        rshrn2          v21.16b, v18.8h,  #6
        rshrn           v22.8b,  v19.8h,  #6
        rshrn2          v22.16b, v20.8h,  #6
        st1             {v21.16b}, [x0],  x1
        st1             {v22.16b}, [x8],  x1
        b.gt            16b
        ret
1280:   // widths 32..128 share one 32-pixels-at-a-time inner loop
640:
320:
        AARCH64_VALID_JUMP_TARGET
        sub             x1,  x1,  w3,  uxtw     // stride minus width advanced in-loop
        add             x7,  x2,  w3,  uxtw     // tmp, second row
321:    // outer loop: two rows, each with its own broadcast mask
        ld2r            {v0.16b,  v1.16b},  [x5],  #2
        mov             w6,  w3
        sub             v20.16b, v4.16b,  v0.16b
        sub             v21.16b, v4.16b,  v1.16b
32:     // inner loop: 32 pixels of each of the two rows
        ld1             {v16.16b, v17.16b}, [x2],  #32
        ld1             {v2.16b,  v3.16b},  [x0]
        subs            w6,  w6,  #32
        umull           v23.8h,  v0.8b,   v16.8b
        umlal           v23.8h,  v2.8b,   v20.8b
        ld1             {v18.16b, v19.16b}, [x7],  #32
        umull2          v27.8h,  v0.16b,  v16.16b
        umlal2          v27.8h,  v2.16b,  v20.16b
        ld1             {v6.16b,  v7.16b},  [x8]
        umull           v24.8h,  v0.8b,   v17.8b
        umlal           v24.8h,  v3.8b,   v20.8b
        umull2          v28.8h,  v0.16b,  v17.16b
        umlal2          v28.8h,  v3.16b,  v20.16b
        umull           v25.8h,  v1.8b,   v18.8b
        umlal           v25.8h,  v6.8b,   v21.8b
        umull2          v5.8h,   v1.16b,  v18.16b
        umlal2          v5.8h,   v6.16b,  v21.16b
        rshrn           v29.8b,  v23.8h,  #6
        rshrn2          v29.16b, v27.8h,  #6
        umull           v26.8h,  v1.8b,   v19.8b
        umlal           v26.8h,  v7.8b,   v21.8b
        umull2          v31.8h,  v1.16b,  v19.16b
        umlal2          v31.8h,  v7.16b,  v21.16b
        rshrn           v30.8b,  v24.8h,  #6
        rshrn2          v30.16b, v28.8h,  #6
        rshrn           v23.8b,  v25.8h,  #6
        rshrn2          v23.16b, v5.8h,   #6
        rshrn           v24.8b,  v26.8h,  #6
        st1             {v29.16b, v30.16b}, [x0],  #32
        rshrn2          v24.16b, v31.8h,  #6
        st1             {v23.16b, v24.16b}, [x8],  #32
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3,  uxtw     // skip the row consumed via x7
        add             x7,  x7,  w3,  uxtw
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc
694
695function blend_v_8bpc_neon, export=1
696        adr             x6,  L(blend_v_tbl)
697        movrel          x5,  X(obmc_masks)
698        add             x5,  x5,  w3,  uxtw
699        clz             w3,  w3
700        movi            v4.16b,  #64
701        add             x8,  x0,  x1
702        lsl             x1,  x1,  #1
703        sub             w3,  w3,  #26
704        ldrh            w3,  [x6,  x3,  lsl #1]
705        sub             x6,  x6,  w3,  uxtw
706        br              x6
70720:
708        AARCH64_VALID_JUMP_TARGET
709        ld1r            {v0.8b},   [x5]
710        sub             v1.8b,   v4.8b,   v0.8b
7112:
712        ld1             {v2.h}[0],   [x2],  #2
713        ld1             {v3.b}[0],   [x0]
714        subs            w4,  w4,  #2
715        ld1             {v2.b}[1],   [x2]
716        ld1             {v3.b}[1],   [x8]
717        umull           v5.8h,   v2.8b,   v0.8b
718        umlal           v5.8h,   v3.8b,   v1.8b
719        rshrn           v5.8b,   v5.8h,   #6
720        add             x2,  x2,  #2
721        st1             {v5.b}[0],   [x0],  x1
722        st1             {v5.b}[1],   [x8],  x1
723        b.gt            2b
724        ret
72540:
726        AARCH64_VALID_JUMP_TARGET
727        ld1r            {v0.2s},   [x5]
728        sub             x1,  x1,  #2
729        sub             v1.8b,   v4.8b,   v0.8b
7304:
731        ld1             {v2.8b},   [x2],  #8
732        ld1             {v3.s}[0],   [x0]
733        ld1             {v3.s}[1],   [x8]
734        subs            w4,  w4,  #2
735        umull           v5.8h,   v2.8b,   v0.8b
736        umlal           v5.8h,   v3.8b,   v1.8b
737        rshrn           v5.8b,   v5.8h,   #6
738        st1             {v5.h}[0],   [x0],  #2
739        st1             {v5.h}[2],   [x8],  #2
740        st1             {v5.b}[2],   [x0],  x1
741        st1             {v5.b}[6],   [x8],  x1
742        b.gt            4b
743        ret
74480:
745        AARCH64_VALID_JUMP_TARGET
746        ld1r            {v0.2d},   [x5]
747        sub             x1,  x1,  #4
748        sub             v1.16b,  v4.16b,  v0.16b
7498:
750        ld1             {v2.16b},  [x2],  #16
751        ld1             {v3.d}[0],   [x0]
752        ld1             {v3.d}[1],   [x8]
753        subs            w4,  w4,  #2
754        umull           v5.8h,  v0.8b,  v2.8b
755        umlal           v5.8h,  v3.8b,  v1.8b
756        umull2          v6.8h,  v0.16b, v2.16b
757        umlal2          v6.8h,  v3.16b, v1.16b
758        rshrn           v7.8b,  v5.8h,  #6
759        rshrn2          v7.16b, v6.8h,  #6
760        st1             {v7.s}[0],   [x0],  #4
761        st1             {v7.s}[2],   [x8],  #4
762        st1             {v7.h}[2],   [x0],  x1
763        st1             {v7.h}[6],   [x8],  x1
764        b.gt            8b
765        ret
766160:
767        AARCH64_VALID_JUMP_TARGET
768        ld1             {v0.16b},  [x5]
769        sub             x1,  x1,  #8
770        sub             v2.16b,  v4.16b,  v0.16b
77116:
772        ld1             {v5.16b,  v6.16b},  [x2],  #32
773        ld1             {v7.16b},  [x0]
774        subs            w4,  w4,  #2
775        ld1             {v16.16b}, [x8]
776        umull           v17.8h,  v5.8b,   v0.8b
777        umlal           v17.8h,  v7.8b,   v2.8b
778        umull2          v18.8h,  v5.16b,  v0.16b
779        umlal2          v18.8h,  v7.16b,  v2.16b
780        umull           v20.8h,  v6.8b,   v0.8b
781        umlal           v20.8h,  v16.8b,  v2.8b
782        umull2          v21.8h,  v6.16b,  v0.16b
783        umlal2          v21.8h,  v16.16b, v2.16b
784        rshrn           v19.8b,  v17.8h,  #6
785        rshrn2          v19.16b, v18.8h,  #6
786        rshrn           v22.8b,  v20.8h,  #6
787        rshrn2          v22.16b, v21.8h,  #6
788        st1             {v19.8b},  [x0],  #8
789        st1             {v22.8b},  [x8],  #8
790        st1             {v19.s}[2],  [x0],  x1
791        st1             {v22.s}[2],  [x8],  x1
792        b.gt            16b
793        ret
794320:
795        AARCH64_VALID_JUMP_TARGET
796        ld1             {v0.16b,  v1.16b},  [x5]
797        sub             x1,  x1,  #16
798        sub             v2.16b,  v4.16b,  v0.16b
799        sub             v3.8b,   v4.8b,   v1.8b
80032:
801        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
802        ld1             {v5.16b,  v6.16b},  [x0]
803        subs            w4,  w4,  #2
804        ld1             {v20.16b, v21.16b}, [x8]
805        umull           v22.8h,  v16.8b,  v0.8b
806        umlal           v22.8h,  v5.8b,   v2.8b
807        umull2          v23.8h,  v16.16b, v0.16b
808        umlal2          v23.8h,  v5.16b,  v2.16b
809        umull           v28.8h,  v17.8b,  v1.8b
810        umlal           v28.8h,  v6.8b,   v3.8b
811        umull           v30.8h,  v18.8b,  v0.8b
812        umlal           v30.8h,  v20.8b,  v2.8b
813        umull2          v31.8h,  v18.16b, v0.16b
814        umlal2          v31.8h,  v20.16b, v2.16b
815        umull           v25.8h,  v19.8b,  v1.8b
816        umlal           v25.8h,  v21.8b,  v3.8b
817        rshrn           v24.8b,  v22.8h,  #6
818        rshrn2          v24.16b, v23.8h,  #6
819        rshrn           v28.8b,  v28.8h,  #6
820        rshrn           v30.8b,  v30.8h,  #6
821        rshrn2          v30.16b, v31.8h,  #6
822        rshrn           v27.8b,  v25.8h,  #6
823        st1             {v24.16b}, [x0],  #16
824        st1             {v30.16b}, [x8],  #16
825        st1             {v28.8b},  [x0],  x1
826        st1             {v27.8b},  [x8],  x1
827        b.gt            32b
828        ret
829L(blend_v_tbl):
830        .hword L(blend_v_tbl) - 320b
831        .hword L(blend_v_tbl) - 160b
832        .hword L(blend_v_tbl) -  80b
833        .hword L(blend_v_tbl) -  40b
834        .hword L(blend_v_tbl) -  20b
835endfunc
836
837
// This has got the same signature as the put_8tap functions,
// and assumes that x8 is set to (clz(w)-24).
function put_neon, export=1
        // Unfiltered block copy for 8 bpc:
        //   x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
        //   w5 = h (row counter), x8 = clz(w)-24 (table index).
        // L(put_tbl) holds 16-bit offsets back from the table label;
        // look up the width-specialized loop and branch to it.
        adr             x9,  L(put_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  x8
        br              x9

20:     // w == 2: two rows of 2 bytes per iteration, via GPR halfwords
        AARCH64_VALID_JUMP_TARGET
2:
        ldrh            w9, [x2]
        ldrh            w10, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w5, w5, #2
        strh            w9, [x0]
        strh            w10, [x0, x1]
        add             x0, x0, x1, lsl #1
        b.gt            2b
        ret
40:     // w == 4: two rows of 4 bytes per iteration
        AARCH64_VALID_JUMP_TARGET
4:
        ldr             w9, [x2]
        ldr             w10, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w5, w5, #2
        str             w9, [x0]
        str             w10, [x0, x1]
        add             x0, x0, x1, lsl #1
        b.gt            4b
        ret
80:     // w == 8: two rows of 8 bytes per iteration
        AARCH64_VALID_JUMP_TARGET
8:
        ldr             x9, [x2]
        ldr             x10, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w5, w5, #2
        str             x9, [x0]
        str             x10, [x0, x1]
        add             x0, x0, x1, lsl #1
        b.gt            8b
        ret
160:    // w == 16: two rows per iteration via q registers
        AARCH64_VALID_JUMP_TARGET
16:
        ldr             q0, [x2]
        ldr             q1, [x2, x3]
        add             x2, x2, x3, lsl #1
        subs            w5, w5, #2
        str             q0, [x0]
        str             q1, [x0, x1]
        add             x0, x0, x1, lsl #1
        b.gt            16b
        ret
320:    // w == 32: two rows per iteration, 32 bytes each via ldp/stp
        AARCH64_VALID_JUMP_TARGET
32:
        ldp             q0, q1, [x2]
        add             x2, x2, x3
        stp             q0, q1, [x0]
        add             x0, x0, x1
        ldp             q2, q3, [x2]
        add             x2, x2, x3
        stp             q2, q3, [x0]
        subs            w5, w5, #2
        add             x0, x0, x1
        b.gt            32b
        ret
640:    // w == 64: one 64-byte row per iteration
        AARCH64_VALID_JUMP_TARGET
64:
        ldp             q0, q1, [x2]
        stp             q0, q1, [x0]
        ldp             q2, q3, [x2, #32]
        add             x2, x2, x3
        stp             q2, q3, [x0, #32]
        subs            w5, w5, #1
        add             x0, x0, x1
        b.gt            64b
        ret
1280:   // w == 128: one 128-byte row per iteration
        AARCH64_VALID_JUMP_TARGET
128:
        ldp             q0, q1, [x2]
        stp             q0, q1, [x0]
        ldp             q2, q3, [x2, #32]
        stp             q2, q3, [x0, #32]
        ldp             q4, q5, [x2, #64]
        stp             q4, q5, [x0, #64]
        ldp             q6, q7, [x2, #96]
        add             x2, x2, x3
        stp             q6, q7, [x0, #96]
        subs            w5, w5, #1
        add             x0, x0, x1
        b.gt            128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 1280b
        .hword L(put_tbl) -  640b
        .hword L(put_tbl) -  320b
        .hword L(put_tbl) -  160b
        .hword L(put_tbl) -   80b
        .hword L(put_tbl) -   40b
        .hword L(put_tbl) -   20b
endfunc
946
947
// This has got the same signature as the prep_8tap functions,
// and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
function prep_neon, export=1
        // Unfiltered "prep": widen 8 bpc pixels to the 16-bit
        // intermediate format (pixel << 4) and store them packed:
        //   x0 = tmp (int16 output, written sequentially),
        //   x1 = src, x2 = src stride, w4 = h, x8 = clz(w)-24.
        // Widening is done either as ushll #4 or as umull by v24
        // (= 16) — the two forms are equivalent; presumably mixed to
        // spread work across execution pipelines (NOTE(review): confirm).
        adr             x9,  L(prep_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        movi            v24.16b, #16   // multiplier for the umull forms
        sub             x9,  x9,  x8
        br              x9

40:     // w == 4: four rows per iteration
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v0.s}[0], [x1], x2
        ld1             {v0.s}[1], [x1], x2
        ld1             {v1.s}[0], [x1], x2
        ld1             {v1.s}[1], [x1], x2
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        subs            w4, w4, #4
        stp             q0, q1, [x0], #32
        b.gt            4b
        ret
80:     // w == 8: four rows per iteration
        AARCH64_VALID_JUMP_TARGET
8:
        ldr             d0, [x1]
        ldr             d1, [x1, x2]
        add             x1, x1, x2, lsl #1
        ldr             d2, [x1]
        ldr             d3, [x1, x2]
        add             x1, x1, x2, lsl #1
        ushll           v0.8h, v0.8b, #4
        ushll           v1.8h, v1.8b, #4
        umull           v2.8h, v2.8b, v24.8b
        umull           v3.8h, v3.8b, v24.8b
        subs            w4, w4, #4
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        add             x0, x0, #64
        b.gt            8b
        ret
160:    // w == 16: four rows per iteration
        AARCH64_VALID_JUMP_TARGET
16:
        ldr             q1, [x1]
        ldr             q3, [x1, x2]
        add             x1, x1, x2, lsl #1
        ushll           v0.8h, v1.8b, #4
        ushll2          v1.8h, v1.16b, #4
        ldr             q5, [x1]
        ldr             q7, [x1, x2]
        add             x1, x1, x2, lsl #1
        umull           v2.8h, v3.8b, v24.8b
        umull2          v3.8h, v3.16b, v24.16b
        ushll           v4.8h, v5.8b, #4
        ushll2          v5.8h, v5.16b, #4
        umull           v6.8h, v7.8b, v24.8b
        umull2          v7.8h, v7.16b, v24.16b
        subs            w4, w4, #4
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x0, x0, #128
        b.gt            16b
        ret
320:    // w == 32: two rows per iteration
        AARCH64_VALID_JUMP_TARGET
32:
        ldp             q4, q5, [x1]
        add             x1, x1, x2
        ldp             q6, q7, [x1]
        add             x1, x1, x2
        ushll           v0.8h, v4.8b, #4
        ushll2          v1.8h, v4.16b, #4
        umull           v2.8h, v5.8b, v24.8b
        umull2          v3.8h, v5.16b, v24.16b
        ushll           v4.8h, v6.8b, #4
        ushll2          v5.8h, v6.16b, #4
        umull           v6.8h, v7.8b, v24.8b
        umull2          v7.8h, v7.16b, v24.16b
        subs            w4, w4, #2
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x0, x0, #128
        b.gt            32b
        ret
640:    // w == 64: one row per iteration
        AARCH64_VALID_JUMP_TARGET
64:
        ldp             q4, q5, [x1]
        ldp             q6, q7, [x1, #32]
        add             x1, x1, x2
        ushll           v0.8h, v4.8b, #4
        ushll2          v1.8h, v4.16b, #4
        umull           v2.8h, v5.8b, v24.8b
        umull2          v3.8h, v5.16b, v24.16b
        ushll           v4.8h, v6.8b, #4
        ushll2          v5.8h, v6.16b, #4
        umull           v6.8h, v7.8b, v24.8b
        umull2          v7.8h, v7.16b, v24.16b
        subs            w4, w4, #1
        stp             q0, q1, [x0]
        stp             q2, q3, [x0, #32]
        stp             q4, q5, [x0, #64]
        stp             q6, q7, [x0, #96]
        add             x0, x0, #128
        b.gt            64b
        ret
1280:   // w == 128: one row per iteration, in two 64-pixel halves
        AARCH64_VALID_JUMP_TARGET
128:
        ldp             q28, q29, [x1]
        ldp             q30, q31, [x1, #32]
        ushll           v16.8h, v28.8b, #4
        ushll2          v17.8h, v28.16b, #4
        umull           v18.8h, v29.8b, v24.8b
        umull2          v19.8h, v29.16b, v24.16b
        ushll           v20.8h, v30.8b, #4
        ushll2          v21.8h, v30.16b, #4
        umull           v22.8h, v31.8b, v24.8b
        umull2          v23.8h, v31.16b, v24.16b
        ldp             q28, q29, [x1, #64]
        ldp             q30, q31, [x1, #96]
        add             x1, x1, x2
        stp             q16, q17, [x0]
        stp             q18, q19, [x0, #32]
        stp             q20, q21, [x0, #64]
        stp             q22, q23, [x0, #96]
        ushll           v16.8h, v28.8b, #4
        ushll2          v17.8h, v28.16b, #4
        umull           v18.8h, v29.8b, v24.8b
        umull2          v19.8h, v29.16b, v24.16b
        ushll           v20.8h, v30.8b, #4
        ushll2          v21.8h, v30.16b, #4
        umull           v22.8h, v31.8b, v24.8b
        umull2          v23.8h, v31.16b, v24.16b
        subs            w4, w4, #1
        stp             q16, q17, [x0, #128]
        stp             q18, q19, [x0, #160]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x0, x0, #256
        b.gt            128b
        ret

L(prep_tbl):
        .hword L(prep_tbl) - 1280b
        .hword L(prep_tbl) -  640b
        .hword L(prep_tbl) -  320b
        .hword L(prep_tbl) -  160b
        .hword L(prep_tbl) -   80b
        .hword L(prep_tbl) -   40b
endfunc
1104
1105
// Load a single lane (element [0]) of \wd-sized elements into up to
// seven registers, alternating between the two source pointers \s0/\s1
// and post-incrementing the used pointer by \strd each time:
// d0,d2,d4,d6 read through \s0; d1,d3,d5 read through \s1.
// Trailing destination arguments may be omitted (.ifnb guards).
.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}[0], [\s0], \strd
        ld1             {\d1\wd}[0], [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}[0], [\s0], \strd
        ld1             {\d3\wd}[0], [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}[0], [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}[0], [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}[0], [\s0], \strd
.endif
.endm
// Same alternating-pointer pattern as load_slice, but loading whole
// registers with arrangement \wd instead of a single lane.
.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        ld1             {\d0\wd}, [\s0], \strd
        ld1             {\d1\wd}, [\s1], \strd
.ifnb \d2
        ld1             {\d2\wd}, [\s0], \strd
        ld1             {\d3\wd}, [\s1], \strd
.endif
.ifnb \d4
        ld1             {\d4\wd}, [\s0], \strd
.endif
.ifnb \d5
        ld1             {\d5\wd}, [\s1], \strd
.endif
.ifnb \d6
        ld1             {\d6\wd}, [\s0], \strd
.endif
.endm
// Convenience wrappers for the element/register sizes used below.
.macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
.macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
.endm
// Pair each row with its successor: TRN1 interleaves the even lanes of
// its two operands, so with row data held in the low part of each
// register this leaves \r0 = {row0, row1}, \r1 = {row1, row2}, etc.
// Rows \r3/\r4 are optional.
.macro interleave_1 wd, r0, r1, r2, r3, r4
        trn1            \r0\wd, \r0\wd, \r1\wd
        trn1            \r1\wd, \r1\wd, \r2\wd
.ifnb \r3
        trn1            \r2\wd, \r2\wd, \r3\wd
        trn1            \r3\wd, \r3\wd, \r4\wd
.endif
.endm
.macro interleave_1_h r0, r1, r2, r3, r4
        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
.endm
.macro interleave_1_s r0, r1, r2, r3, r4
        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
.endm
// Like interleave_1 but pairing rows two apart:
// \r0 = {row0, row2}, \r1 = {row1, row3}, and so on.
.macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        trn1            \r0\wd,  \r0\wd, \r2\wd
        trn1            \r1\wd,  \r1\wd, \r3\wd
        trn1            \r2\wd,  \r2\wd, \r4\wd
        trn1            \r3\wd,  \r3\wd, \r5\wd
.endm
.macro interleave_2_s r0, r1, r2, r3, r4, r5
        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
.endm
// Widen up to seven registers in place from u8 to u16 (uxtl 8b -> 8h).
// Registers \r2..\r6 are optional.
.macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        uxtl            \r0\().8h, \r0\().8b
        uxtl            \r1\().8h, \r1\().8b
.ifnb \r2
        uxtl            \r2\().8h, \r2\().8b
        uxtl            \r3\().8h, \r3\().8b
.endif
.ifnb \r4
        uxtl            \r4\().8h, \r4\().8b
.endif
.ifnb \r5
        uxtl            \r5\().8h, \r5\().8b
.endif
.ifnb \r6
        uxtl            \r6\().8h, \r6\().8b
.endif
.endm
// FIR filter accumulation macros. Filter coefficients live in
// v0.h[0..7]; inputs \sN are successive (already widened) source taps.
// 4-tap: \d = s0*c0 + s1*c1 + s2*c2 + s3*c3.
.macro mul_mla_4tap d, s0, s1, s2, s3, wd
        mul             \d\wd,  \s0\wd,  v0.h[0]
        mla             \d\wd,  \s1\wd,  v0.h[1]
        mla             \d\wd,  \s2\wd,  v0.h[2]
        mla             \d\wd,  \s3\wd,  v0.h[3]
.endm
// Interleaving the mul/mla chains actually hurts performance
// significantly on Cortex A53, thus keeping mul/mla tightly
// chained like this.
// 6-tap variants only use coefficients v0.h[1..6]; the outermost
// arguments (\s0 and \s7/\s8/\s9) exist to keep the argument list
// compatible with the 8-tap variants but are unused here.
// _4h operates on four lanes instead of eight.
.macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
        mul             \d0\().4h, \s1\().4h, v0.h[1]
        mla             \d0\().4h, \s2\().4h, v0.h[2]
        mla             \d0\().4h, \s3\().4h, v0.h[3]
        mla             \d0\().4h, \s4\().4h, v0.h[4]
        mla             \d0\().4h, \s5\().4h, v0.h[5]
        mla             \d0\().4h, \s6\().4h, v0.h[6]
.endm
.macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
        mul             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
.endm
// _1: two outputs from input windows shifted by one source register
// (\d0 from s1..s6, \d1 from s2..s7).
.macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mul             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
.endm
// _2: two outputs from input windows shifted by two source registers
// (\d0 from s1..s6, \d1 from s3..s8).
.macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mul             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
.endm
// 8-tap variants: full coefficient set v0.h[0..7].
.macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
        mul             \d0\().4h, \s0\().4h, v0.h[0]
        mla             \d0\().4h, \s1\().4h, v0.h[1]
        mla             \d0\().4h, \s2\().4h, v0.h[2]
        mla             \d0\().4h, \s3\().4h, v0.h[3]
        mla             \d0\().4h, \s4\().4h, v0.h[4]
        mla             \d0\().4h, \s5\().4h, v0.h[5]
        mla             \d0\().4h, \s6\().4h, v0.h[6]
        mla             \d0\().4h, \s7\().4h, v0.h[7]
.endm
.macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
.endm
// _1: two outputs, windows shifted by one register (s0..s7 and s1..s8).
.macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s1\().8h, v0.h[0]
        mla             \d1\().8h, \s2\().8h, v0.h[1]
        mla             \d1\().8h, \s3\().8h, v0.h[2]
        mla             \d1\().8h, \s4\().8h, v0.h[3]
        mla             \d1\().8h, \s5\().8h, v0.h[4]
        mla             \d1\().8h, \s6\().8h, v0.h[5]
        mla             \d1\().8h, \s7\().8h, v0.h[6]
        mla             \d1\().8h, \s8\().8h, v0.h[7]
.endm
// _2: two outputs, windows shifted by two registers (s0..s7 and s2..s9).
.macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
        mul             \d0\().8h, \s0\().8h, v0.h[0]
        mla             \d0\().8h, \s1\().8h, v0.h[1]
        mla             \d0\().8h, \s2\().8h, v0.h[2]
        mla             \d0\().8h, \s3\().8h, v0.h[3]
        mla             \d0\().8h, \s4\().8h, v0.h[4]
        mla             \d0\().8h, \s5\().8h, v0.h[5]
        mla             \d0\().8h, \s6\().8h, v0.h[6]
        mla             \d0\().8h, \s7\().8h, v0.h[7]
        mul             \d1\().8h, \s2\().8h, v0.h[0]
        mla             \d1\().8h, \s3\().8h, v0.h[1]
        mla             \d1\().8h, \s4\().8h, v0.h[2]
        mla             \d1\().8h, \s5\().8h, v0.h[3]
        mla             \d1\().8h, \s6\().8h, v0.h[4]
        mla             \d1\().8h, \s7\().8h, v0.h[5]
        mla             \d1\().8h, \s8\().8h, v0.h[6]
        mla             \d1\().8h, \s9\().8h, v0.h[7]
.endm
// Saturating, rounding, unsigned-narrowing right shift (16 -> 8 bit)
// on up to four registers in place; \r1 is optional, \r2/\r3 together.
.macro sqrshrun_b shift, r0, r1, r2, r3
        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
.ifnb \r1
        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
.endif
.ifnb \r2
        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
.endif
.endm
// Signed rounding right shift, keeping 16-bit elements, on up to four
// registers in place; same optional-argument pattern as above.
.macro srshr_h shift, r0, r1, r2, r3
        srshr           \r0\().8h, \r0\().8h,  #\shift
.ifnb \r1
        srshr           \r1\().8h, \r1\().8h,  #\shift
.endif
.ifnb \r2
        srshr           \r2\().8h, \r2\().8h,  #\shift
        srshr           \r3\().8h, \r3\().8h,  #\shift
.endif
.endm
// Store macros: output rows are interleaved within one register, with
// alternating rows going to the two destination pointers x0 and x8,
// each advanced by \strd per store.
// st_h: store 2 or 4 halfword lanes (\lanes rows of a 2-pixel block).
.macro st_h strd, reg, lanes
        st1             {\reg\().h}[0], [x0], \strd
        st1             {\reg\().h}[1], [x8], \strd
.if \lanes > 2
        st1             {\reg\().h}[2], [x0], \strd
        st1             {\reg\().h}[3], [x8], \strd
.endif
.endm
// st_s: store .s lanes [0]/[1] of each register (two 4-byte rows);
// \r1 is optional.
.macro st_s strd, r0, r1
        st1             {\r0\().s}[0], [x0], \strd
        st1             {\r0\().s}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().s}[0], [x0], \strd
        st1             {\r1\().s}[1], [x8], \strd
.endif
.endm
// st_d: same as st_s but with 8-byte (.d) lanes per row.
.macro st_d strd, r0, r1
        st1             {\r0\().d}[0], [x0], \strd
        st1             {\r0\().d}[1], [x8], \strd
.ifnb \r1
        st1             {\r1\().d}[0], [x0], \strd
        st1             {\r1\().d}[1], [x8], \strd
.endif
.endm
// Finish a 4-pixel-wide filter result: for "put", narrow to 8-bit
// pixels (shift 6) and store 4 bytes/row; for "prep", round to the
// 16-bit intermediate (shift 2) and store 8 bytes (4 int16) per row.
.macro shift_store_4 type, strd, r0, r1
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1
        st_s            \strd, \r0, \r1
.else
        srshr_h         2,     \r0, \r1
        st_d            \strd, \r0, \r1
.endif
.endm
// Store up to eight whole registers (arrangement \wd), one row each,
// alternating between destination pointers x0 and x8, each advanced by
// \strd; \r2/\r3 and \r4..\r7 are optional groups.
.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        st1             {\r0\wd}, [x0], \strd
        st1             {\r1\wd}, [x8], \strd
.ifnb \r2
        st1             {\r2\wd}, [x0], \strd
        st1             {\r3\wd}, [x8], \strd
.endif
.ifnb \r4
        st1             {\r4\wd}, [x0], \strd
        st1             {\r5\wd}, [x8], \strd
        st1             {\r6\wd}, [x0], \strd
        st1             {\r7\wd}, [x8], \strd
.endif
.endm
.macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
.macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
.endm
// Finish an 8-pixel-wide filter result: "put" narrows to 8-bit pixels
// (shift 6, 8 bytes/row); "prep" rounds by 2 and stores 16-bit rows.
.macro shift_store_8 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun_b      6,     \r0, \r1, \r2, \r3
        st_8b           \strd, \r0, \r1, \r2, \r3
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st_16b          \strd, \r0, \r1, \r2, \r3
.endif
.endm
// Finish a 16-pixel-wide filter result held as four 8h registers
// (two per row): "put" narrows pairs into 16-byte rows; "prep" rounds
// by 2 and stores two 8h registers per row.
.macro shift_store_16 type, strd, r0, r1, r2, r3
.ifc \type, put
        sqrshrun        \r0\().8b,  \r0\().8h, #6
        sqrshrun2       \r0\().16b, \r1\().8h, #6
        sqrshrun        \r2\().8b,  \r2\().8h, #6
        sqrshrun2       \r2\().16b, \r3\().8h, #6
        st_16b          \strd, \r0, \r2
.else
        srshr_h         2,     \r0, \r1, \r2, \r3
        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
.endif
.endm
1396
// Emit a public entry point \op\()_8tap_\type\()_8bpc_neon that loads
// the horizontal/vertical filter-type constants into x8/x9 and
// tail-calls the shared \op\()_\taps\()_neon implementation.
.macro make_8tap_fn op, type, type_h, type_v, taps
function \op\()_8tap_\type\()_8bpc_neon, export=1
        mov             x8,  \type_h
        mov             x9,  \type_v
        b               \op\()_\taps\()_neon
endfunc
.endm
1404
// No spaces in these expressions, due to gas-preprocessor.
// Packed filter-type constants consumed via x8/x9 in make_8tap_fn:
// the value above bit 7 selects the horizontal filter set, the low
// 7 bits the vertical one. NOTE(review): the *15 scaling presumably
// matches the mc_subpel_filters table layout — confirm at its definition.
#define REGULAR ((0*15<<7)|3*15)
#define SMOOTH  ((1*15<<7)|4*15)
#define SHARP   ((2*15<<7)|3*15)
1409
1410.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
1411function \type\()_\taps\()_neon
1412        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1413        mul             \mx,  \mx, w10
1414        mul             \my,  \my, w10
1415        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
1416        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
1417.ifc \type, prep
1418        uxtw            \d_strd, \w
1419        lsl             \d_strd, \d_strd, #1
1420.endif
1421
1422        clz             w8,  \w
1423        tst             \mx, #(0x7f << 14)
1424        sub             w8,  w8,  #24
1425        movrel          x10, X(mc_subpel_filters), -8
1426        b.ne            L(\type\()_\taps\()_h)
1427        tst             \my, #(0x7f << 14)
1428        b.ne            L(\type\()_\taps\()_v)
1429        b               \type\()_neon
1430
1431L(\type\()_\taps\()_h):
1432        cmp             \w,  #4
1433        ubfx            w9,  \mx, #7, #7
1434        and             \mx, \mx, #0x7f
1435        b.le            4f
1436        mov             \mx,  w9
14374:
1438        tst             \my,  #(0x7f << 14)
1439        add             \xmx, x10, \mx, uxtw #3
1440        b.ne            L(\type\()_\taps\()_hv)
1441
1442        adr             x9,  L(\type\()_\taps\()_h_tbl)
1443        ldrh            w8,  [x9, x8, lsl #1]
1444        sub             x9,  x9,  w8, uxtw
1445        br              x9
1446
144720:     // 2xN h
1448        AARCH64_VALID_JUMP_TARGET
1449.ifc \type, put
1450        add             \xmx,  \xmx,  #2
1451        ld1             {v0.s}[0], [\xmx]
1452        sub             \src,  \src,  #1
1453        add             \ds2,  \dst,  \d_strd
1454        add             \sr2,  \src,  \s_strd
1455        lsl             \d_strd,  \d_strd,  #1
1456        lsl             \s_strd,  \s_strd,  #1
1457        sxtl            v0.8h,  v0.8b
14582:
1459        ld1             {v4.8b},  [\src], \s_strd
1460        ld1             {v6.8b},  [\sr2], \s_strd
1461        uxtl            v4.8h,  v4.8b
1462        uxtl            v6.8h,  v6.8b
1463        ext             v5.16b, v4.16b, v4.16b, #2
1464        ext             v7.16b, v6.16b, v6.16b, #2
1465        subs            \h,  \h,  #2
1466        trn1            v3.2s,  v4.2s,  v6.2s
1467        trn2            v6.2s,  v4.2s,  v6.2s
1468        trn1            v4.2s,  v5.2s,  v7.2s
1469        trn2            v7.2s,  v5.2s,  v7.2s
1470        mul             v3.4h,  v3.4h,  v0.h[0]
1471        mla             v3.4h,  v4.4h,  v0.h[1]
1472        mla             v3.4h,  v6.4h,  v0.h[2]
1473        mla             v3.4h,  v7.4h,  v0.h[3]
1474        srshr           v3.4h,  v3.4h,  #2
1475        sqrshrun        v3.8b,  v3.8h,  #4
1476        st1             {v3.h}[0], [\dst], \d_strd
1477        st1             {v3.h}[1], [\ds2], \d_strd
1478        b.gt            2b
1479        ret
1480.endif
1481
148240:     // 4xN h
1483        AARCH64_VALID_JUMP_TARGET
1484        add             \xmx,  \xmx,  #2
1485        ld1             {v0.s}[0], [\xmx]
1486        sub             \src,  \src,  #1
1487        add             \ds2,  \dst,  \d_strd
1488        add             \sr2,  \src,  \s_strd
1489        lsl             \d_strd,  \d_strd,  #1
1490        lsl             \s_strd,  \s_strd,  #1
1491        sxtl            v0.8h,  v0.8b
14924:
1493        ld1             {v16.8b}, [\src], \s_strd
1494        ld1             {v20.8b}, [\sr2], \s_strd
1495        uxtl            v16.8h,  v16.8b
1496        uxtl            v20.8h,  v20.8b
1497        ext             v17.16b, v16.16b, v16.16b, #2
1498        ext             v18.16b, v16.16b, v16.16b, #4
1499        ext             v19.16b, v16.16b, v16.16b, #6
1500        ext             v21.16b, v20.16b, v20.16b, #2
1501        ext             v22.16b, v20.16b, v20.16b, #4
1502        ext             v23.16b, v20.16b, v20.16b, #6
1503        subs            \h,  \h,  #2
1504        mul             v16.4h,  v16.4h,  v0.h[0]
1505        mla             v16.4h,  v17.4h,  v0.h[1]
1506        mla             v16.4h,  v18.4h,  v0.h[2]
1507        mla             v16.4h,  v19.4h,  v0.h[3]
1508        mul             v20.4h,  v20.4h,  v0.h[0]
1509        mla             v20.4h,  v21.4h,  v0.h[1]
1510        mla             v20.4h,  v22.4h,  v0.h[2]
1511        mla             v20.4h,  v23.4h,  v0.h[3]
1512        srshr           v16.4h,  v16.4h,  #2
1513        srshr           v20.4h,  v20.4h,  #2
1514.ifc \type, put
1515        sqrshrun        v16.8b,  v16.8h,  #4
1516        sqrshrun        v20.8b,  v20.8h,  #4
1517        st1             {v16.s}[0], [\dst], \d_strd
1518        st1             {v20.s}[0], [\ds2], \d_strd
1519.else
1520        st1             {v16.4h}, [\dst], \d_strd
1521        st1             {v20.4h}, [\ds2], \d_strd
1522.endif
1523        b.gt            4b
1524        ret
1525
80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        // Load the 8 filter taps and widen them to signed 16 bit.
        ld1             {v0.8b}, [\xmx]
        // Back the source up by 3 pixels for the left taps of the filter.
        sub             \src,  \src,  #3
        // Process two rows in parallel: \ds2/\sr2 point at the second row
        // and the strides are doubled.
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b
8:
        // Load 16 pixels per row (8 outputs plus filter overlap) and
        // widen to 16 bit.
        ld1             {v16.8b, v17.8b},  [\src], \s_strd
        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b

.ifc \taps, 6tap
        // 6 tap filter: taps 0 and 7 are zero, so only accumulate the
        // contributions of taps 1-6, sliding the window with ext.
        ext             v19.16b, v16.16b, v17.16b, #2
        ext             v23.16b, v20.16b, v21.16b, #2
        mul             v18.8h,  v19.8h,  v0.h[1]
        mul             v22.8h,  v23.8h,  v0.h[1]
.irpc i, 23456
        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
        mla             v18.8h,  v19.8h,  v0.h[\i]
        mla             v22.8h,  v23.8h,  v0.h[\i]
.endr
.else   // 8tap
        // 8 tap filter: accumulate all 8 tap contributions, sliding the
        // window out of v16/v17 (v20/v21) with ext.
        mul             v18.8h,  v16.8h,  v0.h[0]
        mul             v22.8h,  v20.8h,  v0.h[0]
.irpc i, 1234567
        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
        mla             v18.8h,  v19.8h,  v0.h[\i]
        mla             v22.8h,  v23.8h,  v0.h[\i]
.endr
.endif
        subs            \h,  \h,  #2
        // Round down to the intermediate precision.
        srshr           v18.8h,  v18.8h, #2
        srshr           v22.8h,  v22.8h, #2
.ifc \type, put
        // put: narrow with rounding and saturation to 8 bit pixels.
        sqrshrun        v18.8b,  v18.8h, #4
        sqrshrun        v22.8b,  v22.8h, #4
        st1             {v18.8b}, [\dst], \d_strd
        st1             {v22.8b}, [\ds2], \d_strd
.else
        // prep: store the 16 bit intermediates.
        st1             {v18.8h}, [\dst], \d_strd
        st1             {v22.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        // Load the 8 filter taps and widen them to signed 16 bit.
        ld1             {v0.8b}, [\xmx]
        // Back the source up by 3 pixels for the left taps of the filter.
        sub             \src,  \src,  #3
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b

        // Turn the strides into increments from the end of one row pair to
        // the start of the next; the rows themselves are advanced by the
        // post-incrementing loads/stores in the inner loop below.
        sub             \s_strd,  \s_strd,  \w, uxtw
        sub             \s_strd,  \s_strd,  #8
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw
.endif
161:    // Outer loop; one pair of rows per iteration.
        // Load the first 24 pixels of each row and widen to 16 bit.
        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
        // \mx counts the remaining pixels within the current row pair.
        mov             \mx, \w
        uxtl            v16.8h,  v16.8b
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v20.8h,  v20.8b
        uxtl            v21.8h,  v21.8b
        uxtl            v22.8h,  v22.8b

16:     // Inner loop; 16 output pixels per row and iteration.
.ifc \taps, 6tap
        // 6 tap filter: taps 0 and 7 are zero, so only accumulate the
        // contributions of taps 1-6, sliding the window with ext.
        ext             v28.16b, v16.16b, v17.16b, #2
        ext             v29.16b, v17.16b, v18.16b, #2
        ext             v30.16b, v20.16b, v21.16b, #2
        ext             v31.16b, v21.16b, v22.16b, #2
        mul             v24.8h,  v28.8h,  v0.h[1]
        mul             v25.8h,  v29.8h,  v0.h[1]
        mul             v26.8h,  v30.8h,  v0.h[1]
        mul             v27.8h,  v31.8h,  v0.h[1]
.irpc i, 23456
        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
        mla             v24.8h,  v28.8h,  v0.h[\i]
        mla             v25.8h,  v29.8h,  v0.h[\i]
        mla             v26.8h,  v30.8h,  v0.h[\i]
        mla             v27.8h,  v31.8h,  v0.h[\i]
.endr
.else   // 8tap
        // 8 tap filter: accumulate all 8 tap contributions for two
        // 8 pixel halves of each of the two rows.
        mul             v24.8h,  v16.8h,  v0.h[0]
        mul             v25.8h,  v17.8h,  v0.h[0]
        mul             v26.8h,  v20.8h,  v0.h[0]
        mul             v27.8h,  v21.8h,  v0.h[0]
.irpc i, 1234567
        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
        mla             v24.8h,  v28.8h,  v0.h[\i]
        mla             v25.8h,  v29.8h,  v0.h[\i]
        mla             v26.8h,  v30.8h,  v0.h[\i]
        mla             v27.8h,  v31.8h,  v0.h[\i]
.endr
.endif
        // Round down to the intermediate precision.
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
        srshr           v26.8h,  v26.8h, #2
        srshr           v27.8h,  v27.8h, #2
        subs            \mx, \mx, #16
.ifc \type, put
        // put: narrow with rounding and saturation to 8 bit pixels.
        sqrshrun        v24.8b,  v24.8h, #4
        sqrshrun2       v24.16b, v25.8h, #4
        sqrshrun        v26.8b,  v26.8h, #4
        sqrshrun2       v26.16b, v27.8h, #4
        st1             {v24.16b}, [\dst], #16
        st1             {v26.16b}, [\ds2], #16
.else
        // prep: store the 16 bit intermediates.
        st1             {v24.8h, v25.8h}, [\dst], #32
        st1             {v26.8h, v27.8h}, [\ds2], #32
.endif
        b.le            9f

        // Shift the remaining overlap pixels down and fetch 16 more
        // source pixels per row.
        mov             v16.16b, v18.16b
        mov             v20.16b, v22.16b
        ld1             {v17.8b, v18.8b}, [\src], #16
        ld1             {v21.8b, v22.8b}, [\sr2], #16
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v21.8h,  v21.8b
        uxtl            v22.8h,  v22.8b
        b               16b

9:
        // Advance all pointers to the next pair of rows.
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret
1680
L(\type\()_\taps\()_h_tbl):
        // Relative jump table for the horizontal cases: 16 bit offsets,
        // subtracted from the table's own address by the dispatch code.
        // Entries are ordered from the largest width down to w=2.
        .hword L(\type\()_\taps\()_h_tbl) - 1280b
        .hword L(\type\()_\taps\()_h_tbl) -  640b
        .hword L(\type\()_\taps\()_h_tbl) -  320b
        .hword L(\type\()_\taps\()_h_tbl) -  160b
        .hword L(\type\()_\taps\()_h_tbl) -   80b
        .hword L(\type\()_\taps\()_h_tbl) -   40b
        .hword L(\type\()_\taps\()_h_tbl) -   20b
        .hword 0
1690
1691
L(\type\()_\taps\()_v):
        // Vertical-only filtering. \my packs two 7 bit filter indices:
        // for h <= 4 the low 7 bits are used, otherwise bits 7-13.
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my, w9
4:
        // x10 presumably holds the filter table base; entries are 8 bytes.
        add             \xmy, x10, \my, uxtw #3

        // Dispatch on the width class via the relative jump table below.
        adr             x9,  L(\type\()_\taps\()_v_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
1705
20:     // 2xN v
        AARCH64_VALID_JUMP_TARGET
        // w=2 only exists for put; prep never sees this width.
.ifc \type, put
        b.gt            28f

        // h <= 4: use the 4 tap filter (skip the two outermost taps).
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        // Back up one row for the top tap; process row pairs with
        // doubled strides.
        sub             \src,  \src,  \s_strd
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        sxtl            v0.8h, v0.8b

        // 2x2 v
        // load_h/interleave_*/uxtl_b/mul_mla_4tap/sqrshrun_b/st_h are
        // helper macros defined elsewhere in this file.
        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_h  v1, v2, v3, v4, v5
        b.gt            24f
        uxtl_b          v1, v2, v3, v4
        mul_mla_4tap    v6, v1, v2, v3, v4, .4h
        sqrshrun_b      6,  v6
        st_h            \d_strd, v6, 2
        ret

24:     // 2x4 v
        load_h          \sr2, \src, \s_strd, v6, v7
        interleave_1_h  v5, v6, v7
        interleave_2_s  v1, v2, v3, v4, v5, v6
        uxtl_b          v1, v2, v3, v4
        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
        sqrshrun_b      6,  v6
        st_h            \d_strd, v6, 4
        ret

28:     // 2x6, 2x8, 2x12, 2x16 v
        // Full 6/8 tap filter: back up 3 rows for the top taps.
        ld1             {v0.8b}, [\xmy]
        sub             \sr2,  \src,  \s_strd, lsl #1
        add             \ds2,  \dst,  \d_strd
        sub             \src,  \sr2,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
        sxtl            v0.8h, v0.8b

        // Prime the sliding window with the first 7 rows.
        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
        interleave_1_h  v1,  v2,  v3,  v4,  v5
        interleave_1_h  v5,  v6,  v7
        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
        uxtl_b          v1,  v2,  v3,  v4
216:    // Loop; produces 4 rows per iteration.
        subs            \h,  \h,  #4
        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
        interleave_1_h  v7,  v16, v17, v18, v19
        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
        uxtl_b          v5,  v6,  v7,  v16
        mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
        sqrshrun_b      6,   v30
        st_h            \d_strd, v30, 4
        b.le            0f
        cmp             \h,  #2
        // Shift the sliding window of source rows down by 4.
        mov             v1.16b,  v5.16b
        mov             v2.16b,  v6.16b
        mov             v3.16b,  v7.16b
        mov             v4.16b,  v16.16b
        mov             v5.16b,  v17.16b
        mov             v6.16b,  v18.16b
        mov             v7.16b,  v19.16b
        b.eq            26f
        b               216b
26:     // Final 2 rows.
        load_h          \sr2, \src, \s_strd, v16, v17
        interleave_1_h  v7,  v16, v17
        uxtl_b          v5,  v6,  v7,  v16
        mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
        sqrshrun_b      6,   v30
        st_h            \d_strd, v30, 2
0:
        ret
.endif
1785
40:
        AARCH64_VALID_JUMP_TARGET
        b.gt            480f

        // 4x2, 4x4 v
        // h <= 4: use the 4 tap filter (skip the two outermost taps).
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        // Back up one row for the top tap; process row pairs with
        // doubled strides.
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        // First two output rows from source rows 1-5.
        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        interleave_1_s  v1, v2, v3, v4, v5
        uxtl_b          v1, v2, v3, v4
        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
        shift_store_4   \type, \d_strd, v6
        b.le            0f
        // Two more output rows, reusing the overlapping source rows.
        load_s          \sr2, \src, \s_strd, v6, v7
        interleave_1_s  v5, v6, v7
        uxtl_b          v5, v6
        mul_mla_4tap    v7, v3, v4, v5, v6, .8h
        shift_store_4   \type, \d_strd, v7
0:
        ret
1814
480:    // 4x6, 4x8, 4x12, 4x16 v
        // Full 6/8 tap filter: back up 3 rows for the top taps.
        ld1             {v0.8b}, [\xmy]
        sub             \sr2, \src, \s_strd, lsl #1
        add             \ds2, \dst, \d_strd
        sub             \src, \sr2, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        // Prime the sliding window with the first 7 rows (v16-v22).
        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        interleave_1_s  v16, v17, v18
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v16, v17
        uxtl_b          v18, v19, v20, v21

48:     // Loop, unrolled to produce up to 4+2+2+4 rows per pass while
        // rotating the sliding window through v16-v27.
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
        interleave_1_s  v22, v23, v24, v25, v26
        uxtl_b          v22, v23, v24, v25
        mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
        shift_store_4   \type, \d_strd, v1, v2
        b.le            0f
        load_s          \sr2,  \src, \s_strd, v27, v16
        subs            \h,  \h,  #2
        interleave_1_s  v26, v27, v16
        uxtl_b          v26, v27
        mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
        shift_store_4   \type, \d_strd, v1
        b.le            0f
        load_s          \sr2,  \src, \s_strd, v17, v18
        subs            \h,  \h,  #2
        interleave_1_s  v16, v17, v18
        uxtl_b          v16, v17
        mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
        shift_store_4   \type, \d_strd, v2
        b.le            0f
        subs            \h,  \h,  #4
        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
        interleave_1_s  v18, v19, v20, v21, v22
        uxtl_b          v18, v19, v20, v21
        mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
        shift_store_4   \type, \d_strd, v1, v2
        b.gt            48b
0:
        ret
1861
80:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f

        // 8x2, 8x4 v
        // h <= 4: use the 4 tap filter (skip the two outermost taps).
        cmp             \h,  #2
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        // Back up one row for the top tap; process row pairs with
        // doubled strides.
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        // First two output rows from source rows 1-5.
        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
        uxtl_b          v1, v2, v3, v4, v5
        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
        mul_mla_4tap    v7, v2, v3, v4, v5, .8h
        shift_store_8   \type, \d_strd, v6, v7
        b.le            0f
        // Two more output rows, reusing the overlapping source rows.
        load_8b         \sr2, \src, \s_strd, v6, v7
        uxtl_b          v6, v7
        mul_mla_4tap    v1, v3, v4, v5, v6, .8h
        mul_mla_4tap    v2, v4, v5, v6, v7, .8h
        shift_store_8   \type, \d_strd, v1, v2
0:
        ret
1890
880:    // 8x6, 8x8, 8x16, 8x32 v
1680:   // 16x8, 16x16, ...
320:    // 32x8, 32x16, ...
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b}, [\xmy]
        // Back up 3 rows for the top taps of the filter.
        sub             \src, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
        sxtl            v0.8h, v0.8b
        // Save the total height; it is restored into \h for each
        // 8 pixel wide column slice.
        mov             \my,  \h
168:    // Outer loop over 8 pixel wide column slices.
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime the sliding window with the first 7 rows (v16-v22).
        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
        uxtl_b          v16, v17, v18, v19, v20, v21, v22

88:     // Inner loop, unrolled to produce 2+2+2+2+4 rows per pass while
        // rotating the sliding window through v16-v27.
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v23, v24
        uxtl_b          v23, v24
        mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v25, v26
        uxtl_b          v25, v26
        mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v27, v16
        uxtl_b          v27, v16
        mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
        shift_store_8   \type, \d_strd, v1, v2
        b.le            9f
        subs            \h,  \h,  #2
        load_8b         \sr2, \src, \s_strd, v17, v18
        uxtl_b          v17, v18
        mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
        shift_store_8   \type, \d_strd, v3, v4
        b.le            9f
        subs            \h,  \h,  #4
        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
        uxtl_b          v19, v20, v21, v22
        mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
        mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
        shift_store_8   \type, \d_strd, v1, v2, v3, v4
        b.gt            88b
9:
        // Advance to the next 8 pixel wide column slice, if any remain.
        subs            \w,  \w,  #8
        b.le            0f
        // Undo the stride doubling and rewind src/dst to the top of the
        // block (\xmy is the 64 bit view of \my, the saved height).
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #3
        mov             \h,  \my
        add             \src, \src, #8
.ifc \type, put
        add             \dst, \dst, #8
.else
        // prep output is 16 bit, so advance by 16 bytes.
        add             \dst, \dst, #16
.endif
        b               168b
0:
        ret
1961
160:
        AARCH64_VALID_JUMP_TARGET
        b.gt            1680b

        // 16x2, 16x4 v
        // h <= 4: use the 4 tap filter (skip the two outermost taps).
        add             \xmy, \xmy, #2
        ld1             {v0.s}[0], [\xmy]
        // Back up one row for the top tap; process row pairs with
        // doubled strides.
        sub             \src, \src, \s_strd
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h, v0.8b

        cmp             \h,  #2
        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
        // Split each 16 pixel row into a low (v16-v20) and a high
        // (v23-v27) 8 pixel half, widened to 16 bit.
        uxtl            v16.8h, v1.8b
        uxtl            v17.8h, v2.8b
        uxtl            v18.8h, v3.8b
        uxtl            v19.8h, v4.8b
        uxtl            v20.8h, v5.8b
        uxtl2           v23.8h, v1.16b
        uxtl2           v24.8h, v2.16b
        uxtl2           v25.8h, v3.16b
        uxtl2           v26.8h, v4.16b
        uxtl2           v27.8h, v5.16b
        mul_mla_4tap    v1,  v16, v17, v18, v19, .8h
        mul_mla_4tap    v16, v17, v18, v19, v20, .8h
        mul_mla_4tap    v2,  v23, v24, v25, v26, .8h
        mul_mla_4tap    v17, v24, v25, v26, v27, .8h
        shift_store_16  \type, \d_strd, v1, v2, v16, v17
        b.le            0f
        // Two more output rows, reusing the overlapping source rows.
        load_16b        \sr2, \src, \s_strd, v6,  v7
        uxtl            v21.8h, v6.8b
        uxtl            v22.8h, v7.8b
        uxtl2           v28.8h, v6.16b
        uxtl2           v29.8h, v7.16b
        mul_mla_4tap    v1,  v18, v19, v20, v21, .8h
        mul_mla_4tap    v3,  v19, v20, v21, v22, .8h
        mul_mla_4tap    v2,  v25, v26, v27, v28, .8h
        mul_mla_4tap    v4,  v26, v27, v28, v29, .8h
        shift_store_16  \type, \d_strd, v1, v2, v3, v4
0:
        ret
2006
L(\type\()_\taps\()_v_tbl):
        // Relative jump table for the vertical cases: 16 bit offsets,
        // subtracted from the table's own address by the dispatch code.
        .hword L(\type\()_\taps\()_v_tbl) - 1280b
        .hword L(\type\()_\taps\()_v_tbl) -  640b
        .hword L(\type\()_\taps\()_v_tbl) -  320b
        .hword L(\type\()_\taps\()_v_tbl) -  160b
        .hword L(\type\()_\taps\()_v_tbl) -   80b
        .hword L(\type\()_\taps\()_v_tbl) -   40b
        .hword L(\type\()_\taps\()_v_tbl) -   20b
        .hword 0
2016
L(\type\()_\taps\()_hv):
        // Combined horizontal+vertical filtering. Same vertical filter
        // selection as in the v path: low 7 bits of \my for h <= 4,
        // bits 7-13 otherwise.
        cmp             \h,  #4
        ubfx            w9,  \my, #7, #7
        and             \my, \my, #0x7f
        b.le            4f
        mov             \my,  w9
4:
        // x10 presumably holds the filter table base; entries are 8 bytes.
        add             \xmy,  x10, \my, uxtw #3

        // Dispatch on the width class via the relative jump table below.
        adr             x9,  L(\type\()_\taps\()_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
2030
20:
        AARCH64_VALID_JUMP_TARGET
        // w=2 only exists for put; prep never sees this width.
.ifc \type, put
        // 4 tap horizontal filter (skip the two outermost taps).
        add             \xmx,  \xmx,  #2
        ld1             {v0.s}[0],  [\xmx]
        b.gt            280f
        // h <= 4: 4 tap vertical filter as well.
        add             \xmy,  \xmy,  #2
        ld1             {v1.s}[0],  [\xmy]

        // 2x2, 2x4 hv
        // Back up 1 pixel and 1 row for the filter support.
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        // Save the return address; the filter_2 helper is entered via bl.
        mov             x15, x30

        // Horizontally filter the topmost context row, reducing the two
        // 4 tap sums with pairwise additions.
        ld1             {v28.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        mul             v28.4h,  v28.4h,  v0.4h
        mul             v29.4h,  v29.4h,  v0.4h
        addp            v28.4h,  v28.4h,  v29.4h
        addp            v16.4h,  v28.4h,  v28.4h
        // Round down to the intermediate precision.
        srshr           v16.4h,  v16.4h,  #2
        bl              L(\type\()_\taps\()_filter_2)

        trn1            v16.2s, v16.2s, v28.2s
        mov             v17.8b, v28.8b

2:      // Loop; produces 2 rows per iteration.
        bl              L(\type\()_\taps\()_filter_2)

        ext             v18.8b, v17.8b, v28.8b, #4
        // Vertical 4 tap filter over the horizontally filtered rows.
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v28.4h, v1.h[3]

        // Narrow with rounding, saturate to 8 bit and store one 2 pixel
        // row to each destination pointer.
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqxtun          v2.8b,  v2.8h
        subs            \h,  \h,  #2
        st1             {v2.h}[0], [\dst], \d_strd
        st1             {v2.h}[1], [\ds2], \d_strd
        b.le            0f
        // Shift the window of filtered rows down by 2.
        mov             v16.8b, v18.8b
        mov             v17.8b, v28.8b
        b               2b
2081
280:    // 2x8, 2x16, 2x32 hv
        // Full 6/8 tap vertical filter: back up 3 rows (and 1 pixel).
        ld1             {v1.8b},  [\xmy]
        sub             \src, \src, #1
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        // Save the return address; the filter_2 helper is entered via bl.
        mov             x15, x30

        // Horizontally filter the topmost context row, reducing the two
        // 4 tap sums with pairwise additions.
        ld1             {v28.8b}, [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        mul             v28.4h,  v28.4h,  v0.4h
        mul             v29.4h,  v29.4h,  v0.4h
        addp            v28.4h,  v28.4h,  v29.4h
        addp            v16.4h,  v28.4h,  v28.4h
        srshr           v16.4h,  v16.4h,  #2

        // Prime the vertical window (v16-v21) with 7 filtered rows.
        bl              L(\type\()_\taps\()_filter_2)
        trn1            v16.2s, v16.2s, v28.2s
        mov             v17.8b, v28.8b
        bl              L(\type\()_\taps\()_filter_2)
        ext             v18.8b, v17.8b, v28.8b, #4
        mov             v19.8b, v28.8b
        bl              L(\type\()_\taps\()_filter_2)
        ext             v20.8b, v19.8b, v28.8b, #4
        mov             v21.8b, v28.8b

28:     // Loop; produces 2 rows per iteration.
        bl              L(\type\()_\taps\()_filter_2)
        ext             v22.8b, v21.8b, v28.8b, #4
.ifc \taps, 6tap
        // 6 tap: taps 0 and 7 are zero, only accumulate taps 1-6.
        smull           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
.else   // 8tap
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]
.endif

        // Narrow with rounding, saturate to 8 bit and store one 2 pixel
        // row to each destination pointer.
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqxtun          v2.8b,  v2.8h
        subs            \h,  \h,  #2
        st1             {v2.h}[0], [\dst], \d_strd
        st1             {v2.h}[1], [\ds2], \d_strd
        b.le            0f
        // Shift the window of filtered rows down by 2.
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v28.8b
        b               28b

0:
        // Return via the link register saved before the bl calls.
        ret             x15
2150
L(\type\()_\taps\()_filter_2):
        // Horizontally filter two new source rows with the 4 tap filter
        // in v0.h[0-3]. Returns the rounded, interleaved results in
        // v28.4h; clobbers v27 and v29-v31.
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v30.8h,  v30.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        ext             v31.16b, v30.16b, v30.16b, #2
        // Transpose so that each register holds the source pixels for one
        // tap offset, for both rows at once.
        trn1            v27.2s,  v28.2s,  v30.2s
        trn2            v30.2s,  v28.2s,  v30.2s
        trn1            v28.2s,  v29.2s,  v31.2s
        trn2            v31.2s,  v29.2s,  v31.2s
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v30.4h,  v0.h[2]
        mla             v27.4h,  v31.4h,  v0.h[3]
        // Round down to the intermediate precision.
        srshr           v28.4h,  v27.4h,  #2
        ret
.endif
2169
40:
        AARCH64_VALID_JUMP_TARGET
        // 4 tap horizontal filter (skip the two outermost taps).
        add             \xmx, \xmx, #2
        ld1             {v0.s}[0],  [\xmx]
        b.gt            480f
        // h <= 4: 4 tap vertical filter as well.
        add             \xmy, \xmy,  #2
        ld1             {v1.s}[0],  [\xmy]
        // Back up 1 pixel and 1 row for the filter support.
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        // Save the return address; the filter_4 helper is entered via bl.
        mov             x15, x30

        // 4x2, 4x4 hv
        // Horizontally filter the topmost context row.
        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        // Round down to the intermediate precision.
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_\taps\()_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b

4:      // Loop; produces 2 rows per iteration.
        bl              L(\type\()_\taps\()_filter_4)
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v28.4h, v1.h[3]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v28.4h, v1.h[2]
        smlal           v3.4s,  v29.4h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        // put: saturate to 8 bit pixels.
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v3.s}[0], [\ds2], \d_strd
.else
        // prep: store the 16 bit intermediates.
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        // Shift the window of filtered rows down by 2.
        mov             v16.8b,  v18.8b
        mov             v17.8b,  v28.8b
        mov             v18.8b,  v29.8b
        b               4b
2232
480:    // 4x8, 4x16, 4x32 hv
        // Full 6/8 tap vertical filter.
        ld1             {v1.8b},  [\xmy]
        sub             \src, \src, #1
.ifc \taps, 6tap
        // 6 tap needs one row less of context above: back up 2 rows.
        sub             \sr2, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
.else
        // 8 tap: back up 3 rows.
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
.endif
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        // Save the return address; the filter_4 helper is entered via bl.
        mov             x15, x30

        // Horizontally filter the topmost context row.
        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
.ifc \taps, 6tap
        // 6 tap: the window starts at v18 (v16/v17 stay unused).
        srshr           v18.4h,  v31.4h,  #2
.else
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_\taps\()_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b
.endif
        // Prime the rest of the vertical window (v19-v22).
        bl              L(\type\()_\taps\()_filter_4)
        mov             v19.8b, v28.8b
        mov             v20.8b, v29.8b
        bl              L(\type\()_\taps\()_filter_4)
        mov             v21.8b, v28.8b
        mov             v22.8b, v29.8b

48:     // Loop; produces 2 rows per iteration.
        bl              L(\type\()_\taps\()_filter_4)
.ifc \taps, 6tap
        // 6 tap: taps 0 and 7 are zero, only accumulate taps 1-6.
        smull           v2.4s,  v18.4h, v1.h[1]
        smlal           v2.4s,  v19.4h, v1.h[2]
        smlal           v2.4s,  v20.4h, v1.h[3]
        smlal           v2.4s,  v21.4h, v1.h[4]
        smlal           v2.4s,  v22.4h, v1.h[5]
        smlal           v2.4s,  v28.4h, v1.h[6]
        smull           v3.4s,  v19.4h, v1.h[1]
        smlal           v3.4s,  v20.4h, v1.h[2]
        smlal           v3.4s,  v21.4h, v1.h[3]
        smlal           v3.4s,  v22.4h, v1.h[4]
        smlal           v3.4s,  v28.4h, v1.h[5]
        smlal           v3.4s,  v29.4h, v1.h[6]
.else   // 8tap
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v19.4h, v1.h[2]
        smlal           v3.4s,  v20.4h, v1.h[3]
        smlal           v3.4s,  v21.4h, v1.h[4]
        smlal           v3.4s,  v22.4h, v1.h[5]
        smlal           v3.4s,  v28.4h, v1.h[6]
        smlal           v3.4s,  v29.4h, v1.h[7]
.endif
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        // put: saturate to 8 bit pixels.
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        st1             {v2.s}[0], [\dst], \d_strd
        st1             {v3.s}[0], [\ds2], \d_strd
.else
        // prep: store the 16 bit intermediates.
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        // Shift the window of filtered rows down by 2.
.ifc \taps, 8tap
        mov             v16.8b,  v18.8b
        mov             v17.8b,  v19.8b
.endif
        mov             v18.8b,  v20.8b
        mov             v19.8b,  v21.8b
        mov             v20.8b,  v22.8b
        mov             v21.8b,  v28.8b
        mov             v22.8b,  v29.8b
        b               48b
0:
        // Return via the link register saved before the bl calls.
        ret             x15
2333
// Horizontal 4-tap filter helper for the narrow hv paths.
// Loads one row each from \sr2 and \src (post-incrementing both by
// \s_strd), widens u8 -> s16 and convolves with the horizontal
// coefficients v0.h[0..3].  Results are rounded down by 2 bits and
// returned in v28.4h / v29.4h.  Clobbers v26-v31.
L(\type\()_\taps\()_filter_4):
        ld1             {v26.8b}, [\sr2], \s_strd
        ld1             {v27.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        uxtl            v27.8h,  v27.8b
        // Shifted copies of row 0 supply the pixels for taps 1..3.
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        // Same convolution for row 1, accumulated in place in v27.
        ext             v28.16b, v27.16b, v27.16b, #2
        ext             v29.16b, v27.16b, v27.16b, #4
        ext             v30.16b, v27.16b, v27.16b, #6
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v29.4h,  v0.h[2]
        mla             v27.4h,  v30.4h,  v0.h[3]
        // Intermediate rounding right-shift by 2 before the vertical pass.
        srshr           v28.4h,  v31.4h,  #2
        srshr           v29.4h,  v27.4h,  #2
        ret
2356
// hv entry for widths 8/16/32.  Reached after a size-class jump; the
// preceding height comparison (NOTE(review): presumably cmp \h, #4 —
// off-screen, confirm) sends taller blocks to 880f.  This path uses a
// 4-tap vertical filter: \xmy + 2 skips the two outer coefficients and
// the 4-byte load below fetches the middle four.
80:
160:
320:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f
        add             \xmy,  \xmy,  #2
        ld1             {v0.8b},  [\xmx]
        ld1             {v1.s}[0],  [\xmy]
        // Back up 3 columns and 1 row to the first sample the filters need.
        sub             \src,  \src,  #3
        sub             \src,  \src,  \s_strd
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30                // save LR across the bl calls below
        mov             \my,  \h                // remember h for the per-8-column restart

164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        // Prime the vertical pipeline: v16 (first row), v17/v18 (next two).
        bl              L(\type\()_\taps\()_filter_8_first)
        bl              L(\type\()_\taps\()_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b

8:
        // 4-tap vertical filter over v16..v18 plus the two fresh rows
        // (v24/v25) produced by filter_8; two output rows per iteration.
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v24.4h, v1.h[2]
        smlal2          v5.4s,  v24.8h, v1.h[2]
        smlal           v2.4s,  v24.4h, v1.h[3]
        smlal2          v3.4s,  v24.8h, v1.h[3]
        smlal           v4.4s,  v25.4h, v1.h[3]
        smlal2          v5.4s,  v25.8h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        // put: narrow to unsigned 8-bit pixels.
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        // prep: keep 16-bit intermediates.
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        // Rotate the row pipeline by two rows and continue.
        mov             v16.16b, v18.16b
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
        b               8b
9:
        // Column loop: step to the next 8-pixel-wide stripe, if any.
        subs            \w,  \w,  #8
        b.le            0f
        // Undo the stride doubling, rewind to the top row
        // (NOTE(review): \xmy appears to alias \my, which holds h — confirm
        // against the macro argument list), and re-apply the 4-row margin.
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #2
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        add             \dst,  \dst,  #16
.endif
        b               164b
2436
// hv path for taller blocks (h > 4): full 8-tap or 6-tap vertical filter.
880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [\xmx]
        ld1             {v1.8b},  [\xmy]
        // Back up 3 columns, plus 3 rows for 8tap / 2 rows for 6tap of
        // vertical filter margin.
        sub             \src,  \src,  #3
.ifc \taps, 8tap
        sub             \src,  \src,  \s_strd
.endif
        sub             \src,  \src,  \s_strd, lsl #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30                // save LR across the bl calls below
        mov             \my,  \h                // remember h for the per-8-column restart

168:
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        // Prime the vertical pipeline with h-filtered rows in v16..v22
        // (6tap reuses the first row for the unused slot instead of
        // loading an extra pair).
        bl              L(\type\()_\taps\()_filter_8_first)
.ifc \taps, 6tap
        mov             v18.16b, v16.16b
.else
        bl              L(\type\()_\taps\()_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
.endif
        bl              L(\type\()_\taps\()_filter_8)
        mov             v19.16b, v24.16b
        mov             v20.16b, v25.16b
        bl              L(\type\()_\taps\()_filter_8)
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b

88:
.ifc \taps, 6tap
        // 6-tap vertical: coefficients v1.h[1..6] over v18..v22 + v24/v25.
        smull           v2.4s,  v18.4h, v1.h[1]
        smull2          v3.4s,  v18.8h, v1.h[1]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,  v19.4h, v1.h[1]
        smull2          v5.4s,  v19.8h, v1.h[1]
        smlal           v2.4s,  v19.4h, v1.h[2]
        smlal2          v3.4s,  v19.8h, v1.h[2]
        smlal           v4.4s,  v20.4h, v1.h[2]
        smlal2          v5.4s,  v20.8h, v1.h[2]
        smlal           v2.4s,  v20.4h, v1.h[3]
        smlal2          v3.4s,  v20.8h, v1.h[3]
        smlal           v4.4s,  v21.4h, v1.h[3]
        smlal2          v5.4s,  v21.8h, v1.h[3]
        smlal           v2.4s,  v21.4h, v1.h[4]
        smlal2          v3.4s,  v21.8h, v1.h[4]
        smlal           v4.4s,  v22.4h, v1.h[4]
        smlal2          v5.4s,  v22.8h, v1.h[4]
        smlal           v2.4s,  v22.4h, v1.h[5]
        smlal2          v3.4s,  v22.8h, v1.h[5]
        smlal           v4.4s,  v24.4h, v1.h[5]
        smlal2          v5.4s,  v24.8h, v1.h[5]
        smlal           v2.4s,  v24.4h, v1.h[6]
        smlal2          v3.4s,  v24.8h, v1.h[6]
        smlal           v4.4s,  v25.4h, v1.h[6]
        smlal2          v5.4s,  v25.8h, v1.h[6]
.else   // 8tap
        // 8-tap vertical: coefficients v1.h[0..7] over v16..v22 + v24/v25.
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v19.4h, v1.h[2]
        smlal2          v5.4s,  v19.8h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal2          v3.4s,  v19.8h, v1.h[3]
        smlal           v4.4s,  v20.4h, v1.h[3]
        smlal2          v5.4s,  v20.8h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal2          v3.4s,  v20.8h, v1.h[4]
        smlal           v4.4s,  v21.4h, v1.h[4]
        smlal2          v5.4s,  v21.8h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal2          v3.4s,  v21.8h, v1.h[5]
        smlal           v4.4s,  v22.4h, v1.h[5]
        smlal2          v5.4s,  v22.8h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal2          v3.4s,  v22.8h, v1.h[6]
        smlal           v4.4s,  v24.4h, v1.h[6]
        smlal2          v5.4s,  v24.8h, v1.h[6]
        smlal           v2.4s,  v24.4h, v1.h[7]
        smlal2          v3.4s,  v24.8h, v1.h[7]
        smlal           v4.4s,  v25.4h, v1.h[7]
        smlal2          v5.4s,  v25.8h, v1.h[7]
.endif
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        // put: narrow to unsigned 8-bit pixels.
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        // prep: keep 16-bit intermediates.
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        // Rotate the row pipeline by two rows (6tap never reads v16/v17).
.ifc \taps, 8tap
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
.endif
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b
        b               88b
9:
        // Column loop: step to the next 8-pixel-wide stripe, if any.
        subs            \w,  \w,  #8
        b.le            0f
        // Undo the stride doubling, rewind to the top row
        // (NOTE(review): \xmy appears to alias \my, which holds h — confirm
        // against the macro argument list), and re-apply the row margin
        // (8 rows here, reduced by 2 below for 6tap).
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #3
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        add             \dst,  \dst,  #16
.endif
.ifc \taps, 6tap
        add             \src,  \src,  \s_strd,  lsl #1
.endif
        b               168b
0:
        ret             x15                     // x15 holds the original LR
2582
// Horizontal filter for the very first row of an 8-wide hv stripe.
// Loads 16 pixels from \src (advancing by \s_strd), widens to s16 and
// applies the horizontal filter v0 across 8 output positions using ext
// to form the shifted pixel windows.  6tap uses coefficients 1..6,
// 8tap all 8.  Result, rounded down 2 bits, is returned in v16.8h.
// Clobbers v24-v29.
L(\type\()_\taps\()_filter_8_first):
        ld1             {v28.8b, v29.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
.ifc \taps, 6tap
        ext             v24.16b, v28.16b, v29.16b, #(2*1)
        ext             v25.16b, v28.16b, v29.16b, #(2*2)
        ext             v26.16b, v28.16b, v29.16b, #(2*3)
        ext             v27.16b, v28.16b, v29.16b, #(2*4)
        mul             v16.8h,  v24.8h,  v0.h[1]
        mla             v16.8h,  v25.8h,  v0.h[2]
        mla             v16.8h,  v26.8h,  v0.h[3]
        mla             v16.8h,  v27.8h,  v0.h[4]
        ext             v24.16b, v28.16b, v29.16b, #(2*5)
        ext             v25.16b, v28.16b, v29.16b, #(2*6)
        ext             v26.16b, v28.16b, v29.16b, #(2*7)
        mla             v16.8h,  v24.8h,  v0.h[5]
        mla             v16.8h,  v25.8h,  v0.h[6]
.else   // 8tap
        mul             v16.8h,  v28.8h,  v0.h[0]
        ext             v24.16b, v28.16b, v29.16b, #(2*1)
        ext             v25.16b, v28.16b, v29.16b, #(2*2)
        ext             v26.16b, v28.16b, v29.16b, #(2*3)
        ext             v27.16b, v28.16b, v29.16b, #(2*4)
        mla             v16.8h,  v24.8h,  v0.h[1]
        mla             v16.8h,  v25.8h,  v0.h[2]
        mla             v16.8h,  v26.8h,  v0.h[3]
        mla             v16.8h,  v27.8h,  v0.h[4]
        ext             v24.16b, v28.16b, v29.16b, #(2*5)
        ext             v25.16b, v28.16b, v29.16b, #(2*6)
        ext             v26.16b, v28.16b, v29.16b, #(2*7)
        mla             v16.8h,  v24.8h,  v0.h[5]
        mla             v16.8h,  v25.8h,  v0.h[6]
        mla             v16.8h,  v26.8h,  v0.h[7]
.endif
        // Intermediate rounding right-shift by 2 before the vertical pass.
        srshr           v16.8h,  v16.8h,  #2
        ret
2620
// Horizontal filter for two subsequent rows of an 8-wide hv stripe.
// Loads one 16-pixel row from \sr2 and one from \src (advancing both
// by \s_strd), widens to s16, and convolves each with the horizontal
// filter v0 via .irpc-generated ext/mla pairs (taps 2..6 for 6tap,
// 1..7 for 8tap).  Rounded results are returned in v24.8h / v25.8h.
// Clobbers v26-v31.
L(\type\()_\taps\()_filter_8):
        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
        ld1             {v30.8b, v31.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        uxtl            v30.8h,  v30.8b
        uxtl            v31.8h,  v31.8b
.ifc \taps, 6tap
        ext             v26.16b, v28.16b, v29.16b, #2
        ext             v27.16b, v30.16b, v31.16b, #2
        mul             v24.8h,  v26.8h,  v0.h[1]
        mul             v25.8h,  v27.8h,  v0.h[1]
.irpc i, 23456
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
.endr
.else   // 8tap
        mul             v24.8h,  v28.8h,  v0.h[0]
        mul             v25.8h,  v30.8h,  v0.h[0]
.irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
.endr
.endif
        // Intermediate rounding right-shift by 2 before the vertical pass.
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
        ret
2652
// Size-class jump table for the hv paths: 16-bit backward offsets from
// the table base to the 1280/640/.../20 handlers, indexed by clz(w)-24.
L(\type\()_\taps\()_hv_tbl):
        .hword L(\type\()_\taps\()_hv_tbl) - 1280b
        .hword L(\type\()_\taps\()_hv_tbl) -  640b
        .hword L(\type\()_\taps\()_hv_tbl) -  320b
        .hword L(\type\()_\taps\()_hv_tbl) -  160b
        .hword L(\type\()_\taps\()_hv_tbl) -   80b
        .hword L(\type\()_\taps\()_hv_tbl) -   40b
        .hword L(\type\()_\taps\()_hv_tbl) -   20b
        .hword 0
endfunc
.endm
2664
2665
// Bilinear put/prep entry point.  Builds the four bilinear weight
// vectors: v1 = mx, v0 = 16-mx (horizontal), v3 = my, v2 = 16-my
// (vertical), then dispatches on which fractional offsets are nonzero:
// h-only, v-only, hv, or a plain copy (\type\()_neon) when both are 0.
.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
function \type\()_bilin_8bpc_neon, export=1
        dup             v1.16b, \mx
        dup             v3.16b, \my
        mov             w9,  #16
        sub             w8, w9, \mx             // w8 = 16 - mx
        sub             w9, w9, \my             // w9 = 16 - my
        dup             v0.16b, w8
        dup             v2.16b, w9
.ifc \type, prep
        // prep writes 16-bit intermediates packed at 2*w bytes per row.
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        // Size-class index for the jump tables: w8 = clz(w) - 24.
        clz             w8,  \w
        sub             w8,  w8,  #24
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        // Horizontal-only: jump via the h size-class table.
        adr             x9,  L(\type\()_bilin_h_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
2693
// 2xN horizontal bilinear, put only (prep never has w == 2 here).
// Processes two rows per iteration; the two rows are interleaved into
// one vector with trn1 so a single umull/umlal pair filters both.
20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
2:
        ld1             {v4.s}[0],  [\src], \s_strd
        ld1             {v6.s}[0],  [\sr2], \s_strd
        // ext #1 = the right-hand neighbour pixels for the second tap.
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.4h,  v4.4h,  v6.4h
        trn1            v5.4h,  v5.4h,  v7.4h
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b   // px * (16-mx)
        umlal           v4.8h,  v5.8b,  v1.8b   // + next * mx
        uqrshrn         v4.8b,  v4.8h,  #4      // round by the weight scale (16)
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif
2717
// 4xN horizontal bilinear: two rows per iteration, packed into one
// 8-lane vector with trn1.  put narrows to u8 (>>4); prep stores the
// raw 16-bit products.
40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
4:
        ld1             {v4.8b}, [\src], \s_strd
        ld1             {v6.8b}, [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.2s,  v4.2s,  v6.2s
        trn1            v5.2s,  v5.2s,  v7.2s
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b   // px * (16-mx)
        umlal           v4.8h,  v5.8b,  v1.8b   // + next * mx
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.gt            4b
        ret
2744
// 8xN horizontal bilinear: two rows per iteration, filtered
// independently in v4 and v6.
80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
8:
        ld1             {v4.16b}, [\src], \s_strd
        ld1             {v6.16b}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #1
        ext             v7.16b, v6.16b, v6.16b, #1
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b   // px * (16-mx)
        umull           v6.8h,  v6.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b   // + next * mx
        umlal           v6.8h,  v7.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v6.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
// 16xN/32xN/... horizontal bilinear.  Works 16 output pixels at a time
// across the row; the upper half of v16/v20 carries the previous 8
// source bytes so ext #8/#9 can build the pixel and pixel+1 windows
// spanning the 16-byte load boundary.  Strides are pre-decremented by
// the row width so the per-16 stores advance the pointers themselves.
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1

        sub             \s_strd,  \s_strd,  \w, uxtw
        sub             \s_strd,  \s_strd,  #8
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw
.endif
161:
        // Prime the carry halves with the first 8 bytes of each row.
        ld1             {v16.d}[1],  [\src], #8
        ld1             {v20.d}[1],  [\sr2], #8
        mov             \mx, \w                 // \mx re-used as the column counter
16:
        ld1             {v18.16b},  [\src], #16
        ld1             {v22.16b},  [\sr2], #16
        ext             v17.16b, v16.16b, v18.16b, #8   // pixels [0..15]
        ext             v19.16b, v16.16b, v18.16b, #9   // pixels [1..16]
        ext             v21.16b, v20.16b, v22.16b, #8
        ext             v23.16b, v20.16b, v22.16b, #9
        umull           v16.8h,  v17.8b,  v0.8b         // px * (16-mx)
        umull2          v17.8h,  v17.16b, v0.16b
        umull           v20.8h,  v21.8b,  v0.8b
        umull2          v21.8h,  v21.16b, v0.16b
        umlal           v16.8h,  v19.8b,  v1.8b         // + next * mx
        umlal2          v17.8h,  v19.16b, v1.16b
        umlal           v20.8h,  v23.8b,  v1.8b
        umlal2          v21.8h,  v23.16b, v1.16b
        subs            \mx, \mx, #16
.ifc \type, put
        uqrshrn         v16.8b,  v16.8h, #4
        uqrshrn2        v16.16b, v17.8h, #4
        uqrshrn         v20.8b,  v20.8h, #4
        uqrshrn2        v20.16b, v21.8h, #4
        st1             {v16.16b}, [\dst], #16
        st1             {v20.16b}, [\ds2], #16
.else
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v20.8h, v21.8h}, [\ds2], #32
.endif
        b.le            9f

        // Carry the last 16 loaded bytes into the next column group.
        mov             v16.16b, v18.16b
        mov             v20.16b, v22.16b
        b               16b

9:
        // Advance both row pairs by the (pre-adjusted) strides.
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret
2834
// Size-class jump table for the horizontal-only bilinear path.
L(\type\()_bilin_h_tbl):
        .hword L(\type\()_bilin_h_tbl) - 1280b
        .hword L(\type\()_bilin_h_tbl) -  640b
        .hword L(\type\()_bilin_h_tbl) -  320b
        .hword L(\type\()_bilin_h_tbl) -  160b
        .hword L(\type\()_bilin_h_tbl) -   80b
        .hword L(\type\()_bilin_h_tbl) -   40b
        .hword L(\type\()_bilin_h_tbl) -   20b
        .hword 0


// Vertical-only bilinear dispatch.  The cmp against 4 sets the flags
// that the 2xN handler's b.gt consumes (2x2 vs taller special-casing).
L(\type\()_bilin_v):
        cmp             \h,  #4
        adr             x9,  L(\type\()_bilin_v_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
2852
// 2xN vertical bilinear, put only.  2x2 takes the short path at 22:;
// taller blocks run 4 rows at a time at 24:, packing 4 row pairs into
// one vector so a single umull/umlal filters all of them.
20:     // 2xN v
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        cmp             \h,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
        ld1             {v16.h}[0], [\src], \s_strd
        b.gt            24f                     // flags from cmp \h, #4 at dispatch
22:
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        // Pair each row with its successor: v16 = {r0,r1}, v17 = {r1,r2}.
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        umull           v4.8h,  v16.8b,  v2.8b  // row * (16-my)
        umlal           v4.8h,  v17.8b,  v3.8b  // + next row * my
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst]
        st1             {v4.h}[1], [\ds2]
        ret
24:     // 2x4, 2x6, 2x8, ... v
        ld1             {v17.h}[0], [\sr2], \s_strd
        ld1             {v18.h}[0], [\src], \s_strd
        ld1             {v19.h}[0], [\sr2], \s_strd
        ld1             {v20.h}[0], [\src], \s_strd
        sub             \h,  \h,  #4
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        trn1            v18.4h, v18.4h, v19.4h
        trn1            v19.4h, v19.4h, v20.4h
        trn1            v16.2s, v16.2s, v18.2s
        trn1            v17.2s, v17.2s, v19.2s
        umull           v4.8h,  v16.8b,  v2.8b  // row * (16-my)
        umlal           v4.8h,  v17.8b,  v3.8b  // + next row * my
        cmp             \h,  #2
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        st1             {v4.h}[2], [\dst], \d_strd
        st1             {v4.h}[3], [\ds2], \d_strd
        b.lt            0f
        mov             v16.8b, v20.8b          // carry the last row forward
        b.eq            22b                     // exactly 2 rows left
        b               24b
0:
        ret
.endif
2903
// 4xN vertical bilinear: two output rows per iteration from three
// loaded rows (the last is carried into the next iteration).
40:     // 4xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.s}[0], [\src], \s_strd
4:
        ld1             {v17.s}[0], [\sr2], \s_strd
        ld1             {v18.s}[0], [\src], \s_strd
        // v16 = {r0,r1}, v17 = {r1,r2} so one mul pair does both rows.
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        umull           v4.8h,  v16.8b,  v2.8b  // row * (16-my)
        umlal           v4.8h,  v17.8b,  v3.8b  // + next row * my
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b          // carry the last row forward
        b               4b
0:
        ret
2932
// 8xN vertical bilinear: two output rows per iteration, filtered
// independently in v4 and v5.
80:     // 8xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.8b}, [\src], \s_strd
8:
        ld1             {v17.8b}, [\sr2], \s_strd
        ld1             {v18.8b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b  // row * (16-my)
        umull           v5.8h,  v17.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b  // + next row * my
        umlal           v5.8h,  v18.8b,  v3.8b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v5.8b,  v5.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b          // carry the last row forward
        b               8b
0:
        ret
2962
// 16xN/32xN/... vertical bilinear: processes one 16-pixel-wide column
// stripe down the whole height, then restarts at the next stripe.
160:    // 16xN, 32xN, ...
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        mov             \my,  \h                // remember h for the per-stripe restart
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.16b}, [\src], \s_strd
2:
        ld1             {v17.16b}, [\sr2], \s_strd
        ld1             {v18.16b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b  // row * (16-my)
        umull2          v5.8h,  v16.16b, v2.16b
        umull           v6.8h,  v17.8b,  v2.8b
        umull2          v7.8h,  v17.16b, v2.16b
        umlal           v4.8h,  v17.8b,  v3.8b  // + next row * my
        umlal2          v5.8h,  v17.16b, v3.16b
        umlal           v6.8h,  v18.8b,  v3.8b
        umlal2          v7.8h,  v18.16b, v3.16b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn2        v4.16b, v5.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        uqrshrn2        v6.16b, v7.8h,  #4
        st1             {v4.16b}, [\dst], \d_strd
        st1             {v6.16b}, [\ds2], \d_strd
.else
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b        // carry the last row forward
        b               2b
9:
        // Column loop: step to the next 16-pixel stripe, if any.
        subs            \w,  \w,  #16
        b.le            0f
        // Undo stride doubling and rewind to the top row
        // (NOTE(review): \xmy appears to alias \my, which holds h — confirm
        // against the macro argument list), plus the 2-row priming margin.
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h,  \my
        add             \src, \src, #16
.ifc \type, put
        add             \dst, \dst, #16
.else
        add             \dst, \dst, #32
.endif
        b               1b
0:
        ret
3020
// Size-class jump table for the vertical-only bilinear path.
L(\type\()_bilin_v_tbl):
        .hword L(\type\()_bilin_v_tbl) - 1280b
        .hword L(\type\()_bilin_v_tbl) -  640b
        .hword L(\type\()_bilin_v_tbl) -  320b
        .hword L(\type\()_bilin_v_tbl) -  160b
        .hword L(\type\()_bilin_v_tbl) -   80b
        .hword L(\type\()_bilin_v_tbl) -   40b
        .hword L(\type\()_bilin_v_tbl) -   20b
        .hword 0

// 2-D (h then v) bilinear dispatch.  The vertical weights are widened
// to 16-bit here because the vertical pass multiplies the 16-bit
// horizontal intermediates.
L(\type\()_bilin_hv):
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        adr             x9,  L(\type\()_bilin_hv_tbl)
        ldrh            w8,  [x9, x8, lsl #1]
        sub             x9,  x9,  w8, uxtw
        br              x9
3038
// 2xN hv bilinear, put only.  v16 holds the horizontally-filtered
// previous row; each iteration h-filters two new rows, then applies
// the vertical weights to the 16-bit intermediates.  Final uqrshrn #8
// removes both 4-bit weight scales at once.
20:     // 2xN hv
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime: horizontal filter of row 0 into v16.
        ld1             {v28.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1             {v28.s}[0],  [\sr2], \s_strd
        ld1             {v30.s}[0],  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.4h, v28.4h, v30.4h
        trn1            v29.4h, v29.4h, v31.4h
        umull           v17.8h, v28.8b, v0.8b   // h-filter rows 1 and 2
        umlal           v17.8h, v29.8b, v1.8b

        trn1            v16.2s, v16.2s, v17.2s  // v16 = {row0, row1} intermediates

        mul             v4.4h,  v16.4h, v2.4h   // inter * (16-my)
        mla             v4.4h,  v17.4h, v3.4h   // + next inter * my
        uqrshrn         v4.8b,  v4.8h,  #8
        subs            \h,  \h,  #2
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2s, v17.2s, v17.2s  // carry row 2's intermediate
        b               2b
0:
        ret
.endif
3076
// 4xN hv bilinear.  Same structure as the 2xN case but with 4-pixel
// rows packed two-per-vector; put narrows with uqrshrn #8, prep keeps
// 16-bit output rounded by the remaining 4-bit weight scale.
40:     // 4xN hv
        AARCH64_VALID_JUMP_TARGET
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime: horizontal filter of row 0 into v16.
        ld1             {v28.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

4:
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.2s, v28.2s, v30.2s
        trn1            v29.2s, v29.2s, v31.2s
        umull           v17.8h, v28.8b, v0.8b   // h-filter rows 1 and 2
        umlal           v17.8h, v29.8b, v1.8b

        trn1            v16.2d, v16.2d, v17.2d  // v16 = {row0, row1} intermediates

        mul             v4.8h,  v16.8h, v2.8h   // inter * (16-my)
        mla             v4.8h,  v17.8h, v3.8h   // + next inter * my
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        urshr           v4.8h,  v4.8h,  #4
        st1             {v4.d}[0], [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        trn2            v16.2d, v17.2d, v17.2d  // carry row 2's intermediate
        b               4b
0:
        ret
3118
311980:     // 8xN, 16xN, ... hv
3120160:
3121320:
3122640:
31231280:
3124        AARCH64_VALID_JUMP_TARGET
3125        mov             \my,  \h
3126
31271:
3128        add             \sr2, \src, \s_strd
3129        add             \ds2, \dst, \d_strd
3130        lsl             \s_strd, \s_strd, #1
3131        lsl             \d_strd, \d_strd, #1
3132
3133        ld1             {v28.16b},  [\src], \s_strd
3134        ext             v29.16b, v28.16b, v28.16b, #1
3135        umull           v16.8h, v28.8b, v0.8b
3136        umlal           v16.8h, v29.8b, v1.8b
3137
31382:
3139        ld1             {v28.16b},  [\sr2], \s_strd
3140        ld1             {v30.16b},  [\src], \s_strd
3141        ext             v29.16b, v28.16b, v28.16b, #1
3142        ext             v31.16b, v30.16b, v30.16b, #1
3143        umull           v17.8h, v28.8b, v0.8b
3144        umlal           v17.8h, v29.8b, v1.8b
3145        umull           v18.8h, v30.8b, v0.8b
3146        umlal           v18.8h, v31.8b, v1.8b
3147
3148        mul             v4.8h,  v16.8h, v2.8h
3149        mla             v4.8h,  v17.8h, v3.8h
3150        mul             v5.8h,  v17.8h, v2.8h
3151        mla             v5.8h,  v18.8h, v3.8h
3152        subs            \h,  \h,  #2
3153.ifc \type, put
3154        uqrshrn         v4.8b,  v4.8h,  #8
3155        uqrshrn         v5.8b,  v5.8h,  #8
3156        st1             {v4.8b}, [\dst], \d_strd
3157        st1             {v5.8b}, [\ds2], \d_strd
3158.else
3159        urshr           v4.8h,  v4.8h,  #4
3160        urshr           v5.8h,  v5.8h,  #4
3161        st1             {v4.8h}, [\dst], \d_strd
3162        st1             {v5.8h}, [\ds2], \d_strd
3163.endif
3164        b.le            9f
3165        mov             v16.16b, v18.16b
3166        b               2b
31679:
3168        subs            \w,  \w,  #8
3169        b.le            0f
3170        asr             \s_strd,  \s_strd,  #1
3171        asr             \d_strd,  \d_strd,  #1
3172        msub            \src,  \s_strd,  \xmy,  \src
3173        msub            \dst,  \d_strd,  \xmy,  \dst
3174        sub             \src,  \src,  \s_strd,  lsl #1
3175        mov             \h,  \my
3176        add             \src,  \src,  #8
3177.ifc \type, put
3178        add             \dst,  \dst,  #8
3179.else
3180        add             \dst,  \dst,  #16
3181.endif
3182        b               1b
31830:
3184        ret
3185
3186L(\type\()_bilin_hv_tbl):
3187        .hword L(\type\()_bilin_hv_tbl) - 1280b
3188        .hword L(\type\()_bilin_hv_tbl) -  640b
3189        .hword L(\type\()_bilin_hv_tbl) -  320b
3190        .hword L(\type\()_bilin_hv_tbl) -  160b
3191        .hword L(\type\()_bilin_hv_tbl) -   80b
3192        .hword L(\type\()_bilin_hv_tbl) -   40b
3193        .hword L(\type\()_bilin_hv_tbl) -   20b
3194        .hword 0
3195endfunc
3196.endm
3197
// Instantiate all put/prep motion-compensation entry points.
// make_8tap_fn emits a thin alias mapping a (h-filter, v-filter) pair onto the
// shared 8tap/6tap implementation; filter_fn/filter_bilin_fn then emit the
// implementation itself.  The register operands assign which argument/scratch
// registers the macro body uses (dst, dst stride, src, src stride, w, h,
// mx, my plus scratch) — presumably matching the macro parameter list defined
// earlier in this file; confirm against the filter_fn definition (not visible
// in this chunk).  Note prep has no dst stride argument, so x7 is free and is
// passed as a scratch/stride register there.

// put: combinations involving a SHARP tap require the full 8-tap path.
make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap

// put: REGULAR/SMOOTH-only combinations can use the cheaper 6-tap path.
make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10

// prep: same split, but with the prep argument registers (x0 = int16_t *tmp).
make_8tap_fn    prep, regular_sharp,  REGULAR, SHARP,   8tap
make_8tap_fn    prep, smooth_sharp,   SMOOTH,  SHARP,   8tap
make_8tap_fn    prep, sharp,          SHARP,   SHARP,   8tap
make_8tap_fn    prep, sharp_regular,  SHARP,   REGULAR, 8tap
make_8tap_fn    prep, sharp_smooth,   SHARP,   SMOOTH,  8tap
filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  8tap

make_8tap_fn    prep, regular,        REGULAR, REGULAR, 6tap
make_8tap_fn    prep, regular_smooth, REGULAR, SMOOTH,  6tap
make_8tap_fn    prep, smooth,         SMOOTH,  SMOOTH,  6tap
make_8tap_fn    prep, smooth_regular, SMOOTH,  REGULAR, 6tap
filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  6tap
filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
3225
3226
// Load one 8-byte row of filter coefficients indexed by a Q10 fixed-point
// position, then advance the position:
//   w13   = \src >> 10            (integer filter index)
//   \src += \inc                  (step to the next fractional position)
//   \dst  = filter_table[w13]     (x11 = table base, 8 bytes per entry)
// Clobbers w13; x11 must already point at the (biased) filter table.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10             // w13 = position >> 10
        add             \src, \src, \inc           // advance fixed-point position
        ldr             \dst, [x11, w13, sxtw #3]  // load 8 int8 coefficients
.endm
3232
// Horizontal 8-tap warp filter for one row of 8 output pixels.
// Internal helper with an ad-hoc calling convention (NOT AAPCS64):
//   In:    x2  = src pointer (post-incremented by x3 = src stride)
//          w5  = fixed-point x filter position (advanced by w8 on return)
//          w7  = per-pixel x filter position step
//          x11 = warp filter table base (set up by the caller)
//          v22 = splat of 128 (unsigned -> signed pixel bias)
//   Out:   v0.8h = 8 horizontal filter sums (caller applies srshr #3)
//   Clobbers: v1-v7, v16-v20, w12, w13 (via load_filter_row).
function warp_filter_horz_neon
        add             w12, w5,  #512             // +512: round the >>10 index to nearest

        ld1             {v16.8b, v17.8b}, [x2], x3 // load 16 source pixels, advance src

        // Load the 8 per-output-pixel coefficient rows into d0-d7,
        // stepping the position by w7 for each output pixel.
        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        load_filter_row d3, w12, w7
        load_filter_row d4, w12, w7
        load_filter_row d5, w12, w7
        load_filter_row d6, w12, w7
        // subtract by 128 to allow using smull
        eor             v16.8b,  v16.8b,  v22.8b
        eor             v17.8b,  v17.8b,  v22.8b
        load_filter_row d7, w12, w7                // interleaved with the eors for scheduling

        // Each output pixel n multiplies coefficients dn with the 8 source
        // pixels starting at offset n; ext produces those shifted windows.
        ext             v18.8b,  v16.8b,  v17.8b,  #1
        ext             v19.8b,  v16.8b,  v17.8b,  #2
        smull           v0.8h,   v0.8b,   v16.8b
        smull           v1.8h,   v1.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #3
        ext             v20.8b,  v16.8b,  v17.8b,  #4
        smull           v2.8h,   v2.8b,   v19.8b
        smull           v3.8h,   v3.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #5
        ext             v19.8b,  v16.8b,  v17.8b,  #6
        smull           v4.8h,   v4.8b,   v20.8b
        smull           v5.8h,   v5.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #7
        smull           v6.8h,   v6.8b,   v19.8b
        smull           v7.8h,   v7.8b,   v18.8b

        // Pairwise-add reduction tree: sum the 8 products of each output
        // pixel, ending with one 16-bit sum per lane in v0.8h.
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h

        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h

        addp            v0.8h,   v0.8h,   v4.8h

        add             w5,  w5,  w8               // advance x position by the per-row step

        ret
endfunc
3280
// void dav1d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my)
//
// \t is empty for the 8-bit "put" output or "t" for the int16_t intermediate
// ("prep"-style) output; \shift is the final vertical downshift (11 or 7).
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        ldr             x4,  [x4]                  // load all four int16 abcd coefficients at once
        sbfx            x7,  x4, #0,  #16          // x7 = abcd[0] (per-pixel x step)
        sbfx            x8,  x4, #16, #16          // x8 = abcd[1] (per-row x step)
        sbfx            x9,  x4, #32, #16          // x9 = abcd[2] (per-pixel y step)
        sbfx            x4,  x4, #48, #16          // x4 = abcd[3] (per-row y step)
        mov             w10, #8                    // 8 output rows
        sub             x2,  x2,  x3, lsl #1       // src -= 3*stride:
        sub             x2,  x2,  x3               //   center the 8-tap vertical window
        sub             x2,  x2,  #3               // src -= 3: center the 8-tap horizontal window
        movrel          x11, X(mc_warp_filter), 64*8 // bias table base so signed indices work
        mov             x15, x30                   // save lr; the bl below clobbers x30
.ifnb \t
        lsl             x1,  x1,  #1               // int16 output: stride is in elements
.endif

        movi            v22.8b,  #128              // unsigned->signed pixel bias
.ifb \t
        movi            v23.8h,  #128              // output rounding/bias offset (8-bit put)
.else
        movi            v23.8h,  #8, lsl #8        // output offset 2048 (int16 intermediate)
.endif

        // Prime the vertical window: 7 horizontally filtered rows in v24-v30,
        // each rounded down to the intermediate precision with srshr #3.
        bl              warp_filter_horz_neon
        srshr           v24.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v25.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v26.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v27.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v28.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v29.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v30.8h,  v0.8h,  #3

1:
        // Per output row: filter the 8th source row, then apply the
        // vertical 8-tap filter across v24-v31.
        add             w14, w6,  #512             // round the vertical filter index
        bl              warp_filter_horz_neon
        srshr           v31.8h,  v0.8h,  #3

        // Load one vertical coefficient row per output column (step w9),
        // then transpose+sign-extend so v0-v7 hold per-tap coefficients.
        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        // Slide the 8-row window up by one (v24..v30 <- v25..v31),
        // interleaved with the narrowing for scheduling.
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h,  v16.4s,  #\shift  // saturating rounded narrow to 16 bit
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h,  v17.4s,  #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
        add             v16.8h,  v16.8h,  v23.8h   // apply the output offset
.ifb \t
        sqxtun          v16.8b,  v16.8h            // put: saturate to unsigned 8 bit
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
.ifnb \t
        st1             {v16.8h}, [x0], x1         // int16 intermediate output
.else
        st1             {v16.8b}, [x0], x1         // 8-bit pixel output
.endif

        add             w6,  w6,  w4               // advance y position by the per-row step
        b.gt            1b

        ret             x15                        // return via the saved lr
endfunc
.endm
3384
// Instantiate both variants: 8-bit "put" output (vertical shift 11) and the
// int16 intermediate "t" output (vertical shift 7, no final narrowing).
warp  , 11
warp t, 7
3387
// void dav1d_emu_edge_8bpc_neon(
//         const intptr_t bw, const intptr_t bh,
//         const intptr_t iw, const intptr_t ih,
//         const intptr_t x, const intptr_t y,
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *ref, const ptrdiff_t ref_stride)
//
// Produces a bw x bh block at dst by copying the part of ref that lies
// inside the iw x ih frame and replicating edge pixels for the rest.
// Note: stores may overshoot up to the next 16/32-byte boundary, so dst
// rows are presumably padded accordingly — inherited from the C contract.
function emu_edge_8bpc_neon, export=1
        ldp             x8,  x9,  [sp]             // x8 = ref, x9 = ref_stride (stack args)

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        sub             x12, x3,  #1           // ih - 1
        cmp             x5,  x3
        sub             x13, x2,  #1           // iw - 1
        csel            x12, x12, x5,  ge      // min(y, ih - 1)
        cmp             x4,  x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4,  ge      // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
        add             x8,  x8,  x13          // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5,  x1           // y + bh
        neg             x5,  x5                // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1,  #1           // bh - 1
        cmp             x10, x1
        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
        cmp             x5,  x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4,  x0           // x + bw
        neg             x4,  x4                // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0,  #1           // bw - 1
        cmp             x11, x0
        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
        cmp             x4,  x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1,  x1,  x5           // bh - top_ext
        madd            x6,  x5,  x7,  x6      // dst += top_ext * stride
        sub             x2,  x0,  x4           // bw - left_ext (x2 reuses iw: dead from here)
        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst

// Copy center_h rows: optional left-edge splat, the center copy from ref,
// and an optional right-edge splat.  Register roles inside the loop:
//   x6 = dst row, x8 = ref row, x4 = left_ext, x11 = right_ext,
//   x2 = center_w, x1 = center_h; x3/x12/x13 are scratch.
.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.16b}, [x8]             // splat leftmost in-frame pixel
        mov             x12, x6                // out = dst
        mov             x3,  x4
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif
        mov             x13, x8
        add             x12, x6,  x4           // out = dst + left_ext
        mov             x3,  x2
1:
        ld1             {v0.16b, v1.16b}, [x13], #32
        subs            x3,  x3,  #32
        st1             {v0.16b, v1.16b}, [x12], #32
        b.gt            1b
.if \need_right
        add             x3,  x8,  x2           // in + center_w
        sub             x3,  x3,  #1           // in + center_w - 1
        add             x12, x6,  x4           // dst + left_ext
        ld1r            {v0.16b}, [x3]             // splat rightmost in-frame pixel
        add             x12, x12, x2           // out = dst + left_ext + center_w
        mov             x3,  x11
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif

        subs            x1,  x1,  #1           // center_h--
        add             x6,  x6,  x7
        add             x8,  x8,  x9
        b.gt            0b
.endm

        // Dispatch to the specialization matching which side extensions
        // are needed (left_ext != 0 / right_ext != 0).
        cbz             x4,  2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1,   1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0,   1
        b               5f

3:
        // need_left + !need_right
        v_loop          1,   0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0,   0

5:

        cbz             x10, 3f                    // skip if bottom_ext == 0
        // need_bottom: replicate the last written row downwards,
        // 32 columns at a time (x6 now points just past the center rows).
        sub             x8,  x6,  x7           // ref = dst - stride
        mov             x4,  x0
1:
        ld1             {v0.16b, v1.16b}, [x8], #32
        mov             x3,  x10
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
        subs            x4,  x4,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        cbz             x5,  3f                    // skip if top_ext == 0
        // need_top: replicate the first center row (at x14, the saved dst)
        // upwards, 32 columns at a time.
        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.16b, v1.16b}, [x14], #32
        mov             x3,  x5
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
        subs            x0,  x0,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        ret
endfunc
3545