/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Janne Grunau
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

#define PREP_BIAS 8192

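// avg: straight bi-directional average of the two intermediate buffers, roughly
// equivalent to this C sketch (illustrative naming, not the exact C reference):
//     int sh  = intermediate_bits + 1;
//     int rnd = (1 << intermediate_bits) + 2*PREP_BIAS;
//     dst[x]  = iclip((tmp1[x] + tmp2[x] + rnd) >> sh, 0, bitdepth_max);
// The smax against v28 keeps the shifted result >= 0, and the saturating sqsub
// tops out at INT16_MAX, which after the arithmetic shift lands at bitdepth_max,
// so no separate upper clamp is needed.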
.macro avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
.endm

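// w_avg: weighted average, blending the two intermediate buffers as weight/16 vs
// (16-weight)/16. v27 holds -weight, so the core computation below works out to
// tmp2 + (((tmp1 - tmp2) * weight) >> 4) in 32 bit, which is then rounding-shifted
// right by intermediate_bits, re-biased by PREP_BIAS >> intermediate_bits and
// clamped to [0, bitdepth_max].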
.macro w_avg d0, d1, t0, t1, t2, t3
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        // This difference requires a 17 bit range, and all bits are
        // significant for the following multiplication.
        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
        mul             \d0\().4s,  \d0\().4s,  v27.4s
        mul             \t0\().4s,  \t0\().4s,  v27.4s
        mul             \d1\().4s,  \d1\().4s,  v27.4s
        mul             \t1\().4s,  \t1\().4s,  v27.4s
        sshr            \d0\().4s,  \d0\().4s,  #4
        sshr            \t0\().4s,  \t0\().4s,  #4
        sshr            \d1\().4s,  \d1\().4s,  #4
        sshr            \t1\().4s,  \t1\().4s,  #4
        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h // Same as xtn, xtn2
        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h // Ditto
        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
.endm

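// mask: like w_avg, but with a per-pixel 6 bit weight m loaded from x6 instead of
// a single weight, i.e. roughly dst = (tmp1*m + tmp2*(64 - m)) >> 6 before the
// final rescale and clamp. The mask bytes are negated and widened, so the multiply
// below computes (tmp1 - tmp2) * m as (tmp2 - tmp1) * -m.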
.macro mask d0, d1, t0, t1, t2, t3
        ld1             {v27.16b}, [x6],  16
        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
        neg             v27.16b, v27.16b
        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
        sxtl            v26.8h,  v27.8b
        sxtl2           v27.8h,  v27.16b
        sxtl            v24.4s,  v26.4h
        sxtl2           v25.4s,  v26.8h
        sxtl            v26.4s,  v27.4h
        sxtl2           v27.4s,  v27.8h
        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
        mul             \d0\().4s,  \d0\().4s,  v24.4s
        mul             \t0\().4s,  \t0\().4s,  v25.4s
        mul             \d1\().4s,  \d1\().4s,  v26.4s
        mul             \t1\().4s,  \t1\().4s,  v27.4s
        sshr            \d0\().4s,  \d0\().4s,  #6
        sshr            \t0\().4s,  \t0\().4s,  #6
        sshr            \d1\().4s,  \d1\().4s,  #6
        sshr            \t1\().4s,  \t1\().4s,  #6
        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h  // Same as xtn, xtn2
        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h  // Ditto
        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
.endm

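// Shared body for avg/w_avg/mask: sets up the per-type constants in v27-v31
// (bias, shift amounts and the clamping bounds), then dispatches on log2(width)
// through the jump table at the end. Each width branch keeps calling the \type
// macro (16 pixels per call) and storing the result until the height in w5 runs out.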
.macro bidir_fn type, bdmax
function \type\()_16bpc_neon, export=1
        clz             w4,  w4
.ifnc \type, avg
        dup             v31.8h,  \bdmax // bitdepth_max
        movi            v30.8h,  #0
.endif
        clz             w7,  \bdmax
        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
.ifc \type, avg
        mov             w9,  #1
        mov             w8,  #-2*PREP_BIAS
        lsl             w9,  w9,  w7    // 1 << intermediate_bits
        add             w7,  w7,  #1
        sub             w8,  w8,  w9    // -2*PREP_BIAS - 1 << intermediate_bits
        neg             w7,  w7         // -(intermediate_bits+1)
        dup             v28.8h,   w8    // -2*PREP_BIAS - 1 << intermediate_bits
        dup             v29.8h,   w7    // -(intermediate_bits+1)
.else
        mov             w8,  #PREP_BIAS
        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
        neg             w7,  w7         // -intermediate_bits
        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
        dup             v29.8h,  w7     // -intermediate_bits
.endif
.ifc \type, w_avg
        dup             v27.4s,  w6
        neg             v27.4s,  v27.4s
.endif
        adr             x7,  L(\type\()_tbl)
        sub             w4,  w4,  #24
        \type           v4,  v5,  v0,  v1,  v2,  v3
        ldrh            w4,  [x7, x4, lsl #1]
        sub             x7,  x7,  w4, uxtw
        br              x7
40:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
4:
        subs            w5,  w5,  #4
        st1             {v4.d}[0],  [x0], x1
        st1             {v4.d}[1],  [x7], x1
        st1             {v5.d}[0],  [x0], x1
        st1             {v5.d}[1],  [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               4b
80:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  x1
        lsl             x1,  x1,  #1
8:
        st1             {v4.8h},  [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h},  [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               8b
16:
        AARCH64_VALID_JUMP_TARGET
        \type           v6,  v7,  v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.8h, v7.8h}, [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               16b
32:
        AARCH64_VALID_JUMP_TARGET
        \type           v6,  v7,  v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               32b
640:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  #64
64:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               64b
1280:
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x0,  #64
        mov             x8,  #128
        sub             x1,  x1,  #128
128:
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
        \type           v4,  v5,  v0,  v1,  v2,  v3
        \type           v6,  v7,  v0,  v1,  v2,  v3
        \type           v16, v17, v0,  v1,  v2,  v3
        subs            w5,  w5,  #1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
        \type           v18, v19, v0,  v1,  v2,  v3
        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
        b.le            0f
        \type           v4,  v5,  v0,  v1,  v2,  v3
        b               128b
0:
        ret
L(\type\()_tbl):
        .hword L(\type\()_tbl) - 1280b
        .hword L(\type\()_tbl) -  640b
        .hword L(\type\()_tbl) -   32b
        .hword L(\type\()_tbl) -   16b
        .hword L(\type\()_tbl) -   80b
        .hword L(\type\()_tbl) -   40b
endfunc
.endm

bidir_fn avg, w6
bidir_fn w_avg, w7
bidir_fn mask, w7


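// w_mask_444/422/420: bi-directional blend where the per-pixel weight m (38..64) is
// derived from |tmp1 - tmp2|, writing both the blended pixels and the mask itself
// (full resolution for 444, horizontally subsampled for 422, 2x2 subsampled for 420,
// with the sign from w7 folded in). 64-m is computed as (27615 - |tmp1 - tmp2|) >> 10
// with a saturating subtract, and the blend is
// (tmp1*64 + (tmp2 - tmp1)*(64-m) + PREP_BIAS*64 + rounding) >> (intermediate_bits + 6),
// clamped to [0, bitdepth_max].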
.macro w_mask_fn type
function w_mask_\type\()_16bpc_neon, export=1
        ldr             w8,  [sp]
        clz             w9,  w4
        adr             x10, L(w_mask_\type\()_tbl)
        dup             v31.8h,  w8   // bitdepth_max
        sub             w9,  w9,  #24
        clz             w8,  w8       // clz(bitdepth_max)
        ldrh            w9,  [x10,  x9,  lsl #1]
        sub             x10, x10, w9,  uxtw
        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
        mov             w9,  #PREP_BIAS*64
        neg             w8,  w8       // -sh
        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
        dup             v30.4s,  w9   // PREP_BIAS*64
        dup             v29.4s,  w8   // -sh
        dup             v0.8h,   w11
.if \type == 444
        movi            v1.16b,  #64
.elseif \type == 422
        dup             v2.8b,   w7
        movi            v3.8b,   #129
        sub             v3.8b,   v3.8b,   v2.8b
.elseif \type == 420
        dup             v2.8h,   w7
        movi            v3.8h,   #1, lsl #8
        sub             v3.8h,   v3.8h,   v2.8h
.endif
        add             x12,  x0,  x1
        lsl             x1,   x1,  #1
        br              x10
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
        subs            w5,  w5,  #4
        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v7.8h
        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s,  v6.8h,   v4.8h
        ssubl           v18.4s,  v7.4h,   v5.4h
        ssubl2          v19.4s,  v7.8h,   v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
        sshll           v6.4s,   v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        uxtl            v22.4s,  v20.4h
        uxtl2           v23.4s,  v20.8h
        uxtl            v24.4s,  v21.4h
        uxtl2           v25.4s,  v21.8h
        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s,   v17.4s,  v23.4s
        mla             v6.4s,   v18.4s,  v24.4s
        mla             v7.4s,   v19.4s,  v25.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v6.4s
        sqxtun2         v5.8h,   v7.4s
        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        sub             v20.16b, v1.16b,  v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b,  v20.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        trn1            v24.2d,  v20.2d,  v21.2d
        trn2            v25.2d,  v20.2d,  v21.2d
        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.d}[0],  [x0],  x1
        st1             {v4.d}[1],  [x12], x1
        st1             {v5.d}[0],  [x0],  x1
        st1             {v5.d}[1],  [x12], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
        subs            w5,  w5,  #2
        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v7.8h
        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s,  v6.8h,   v4.8h
        ssubl           v18.4s,  v7.4h,   v5.4h
        ssubl2          v19.4s,  v7.8h,   v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
        sshll           v6.4s,   v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        uxtl            v22.4s,  v20.4h
        uxtl2           v23.4s,  v20.8h
        uxtl            v24.4s,  v21.4h
        uxtl2           v25.4s,  v21.8h
        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
        mla             v5.4s,   v17.4s,  v23.4s
        mla             v6.4s,   v18.4s,  v24.4s
        mla             v7.4s,   v19.4s,  v25.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v6.4s
        sqxtun2         v5.8h,   v7.4s
        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        sub             v20.16b, v1.16b,  v20.16b // m
        st1             {v20.16b}, [x6], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        xtn             v20.8b,  v20.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        st1             {v20.8b}, [x6], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.s}[0], [x6], #4
.endif
        st1             {v4.8h}, [x0],  x1
        st1             {v5.8h}, [x12], x1
        b.gt            8b
        ret
1280:
640:
320:
160:
        AARCH64_VALID_JUMP_TARGET
        mov             w11, w4
        sub             x1,  x1,  w4,  uxtw #1
.if \type == 444
        add             x10, x6,  w4,  uxtw
.elseif \type == 422
        add             x10, x6,  x11, lsr #1
.endif
        add             x9,  x3,  w4,  uxtw #1
        add             x7,  x2,  w4,  uxtw #1
161:
        mov             w8,  w4
16:
        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
        ld1             {v6.8h,   v7.8h},  [x7], #32
        ld1             {v18.8h,  v19.8h}, [x9], #32
        subs            w8,  w8,  #16
        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
        sabd            v21.8h,  v5.8h,   v17.8h
        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v23.4s,  v16.8h,  v4.8h
        ssubl           v24.4s,  v17.4h,  v5.4h
        ssubl2          v25.4s,  v17.8h,  v5.8h
        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
        uqsub           v21.8h,  v0.8h,   v21.8h
        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
        sshll           v26.4s,  v5.4h,   #6
        sshll2          v5.4s,   v4.8h,   #6
        sshll           v4.4s,   v4.4h,   #6
        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v21.8h,  v21.8h,  #10
        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
        add             v5.4s,   v5.4s,   v30.4s
        add             v26.4s,  v26.4s,  v30.4s
        add             v27.4s,  v27.4s,  v30.4s
        uxtl            v16.4s,  v20.4h
        uxtl2           v17.4s,  v20.8h
        uxtl            v28.4s,  v21.4h
        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
        uxtl2           v16.4s,  v21.8h
        mla             v5.4s,   v23.4s,  v17.4s
        mla             v26.4s,  v24.4s,  v28.4s
        mla             v27.4s,  v25.4s,  v16.4s
        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v5.4s,   v5.4s,   v29.4s
        srshl           v26.4s,  v26.4s,  v29.4s
        srshl           v27.4s,  v27.4s,  v29.4s
        sqxtun          v4.4h,   v4.4s            // iclip_pixel
        sqxtun2         v4.8h,   v5.4s
        sqxtun          v5.4h,   v26.4s
        sqxtun2         v5.8h,   v27.4s

        // Start of other half
        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
        sabd            v23.8h,  v7.8h,   v19.8h

        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
        umin            v5.8h,   v5.8h,   v31.8h

        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
        ssubl2          v17.4s,  v18.8h,  v6.8h
        ssubl           v18.4s,  v19.4h,  v7.4h
        ssubl2          v19.4s,  v19.8h,  v7.8h
        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
        uqsub           v23.8h,  v0.8h,   v23.8h
        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
        sshll2          v25.4s,  v6.8h,   #6
        sshll           v26.4s,  v7.4h,   #6
        sshll2          v27.4s,  v7.8h,   #6
        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
        ushr            v23.8h,  v23.8h,  #10
        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
        add             v25.4s,  v25.4s,  v30.4s
        add             v26.4s,  v26.4s,  v30.4s
        add             v27.4s,  v27.4s,  v30.4s
        uxtl            v6.4s,   v22.4h
        uxtl2           v7.4s,   v22.8h
        uxtl            v28.4s,  v23.4h
        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
        uxtl2           v6.4s,   v23.8h
        mla             v25.4s,  v17.4s,  v7.4s
        mla             v26.4s,  v18.4s,  v28.4s
        mla             v27.4s,  v19.4s,  v6.4s
        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
        srshl           v25.4s,  v25.4s,  v29.4s
        srshl           v26.4s,  v26.4s,  v29.4s
        srshl           v27.4s,  v27.4s,  v29.4s
        sqxtun          v6.4h,   v24.4s           // iclip_pixel
        sqxtun2         v6.8h,   v25.4s
        sqxtun          v7.4h,   v26.4s
        sqxtun2         v7.8h,   v27.4s
        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
        umin            v7.8h,   v7.8h,   v31.8h
.if \type == 444
        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
        uzp1            v21.16b, v22.16b, v23.16b
        sub             v20.16b, v1.16b,  v20.16b // m
        sub             v21.16b, v1.16b,  v21.16b
        st1             {v20.16b}, [x6],  #16
        st1             {v21.16b}, [x10], #16
.elseif \type == 422
        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
        addp            v21.8h,  v22.8h,  v23.8h
        xtn             v20.8b,  v20.8h
        xtn             v21.8b,  v21.8h
        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n))) >> 1
        uhsub           v21.8b,  v3.8b,   v21.8b
        st1             {v20.8b}, [x6],  #8
        st1             {v21.8b}, [x10], #8
.elseif \type == 420
        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
        add             v21.8h,  v21.8h,  v23.8h
        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
        st1             {v20.8b}, [x6], #8
.endif
        st1             {v4.8h, v5.8h}, [x0],  #32
        st1             {v6.8h, v7.8h}, [x12], #32
        b.gt            16b
        subs            w5,  w5,  #2
        add             x2,  x2,  w4,  uxtw #1
        add             x3,  x3,  w4,  uxtw #1
        add             x7,  x7,  w4,  uxtw #1
        add             x9,  x9,  w4,  uxtw #1
.if \type == 444
        add             x6,  x6,  w4,  uxtw
        add             x10, x10, w4,  uxtw
.elseif \type == 422
        add             x6,  x6,  x11, lsr #1
        add             x10, x10, x11, lsr #1
.endif
        add             x0,  x0,  x1
        add             x12, x12, x1
        b.gt            161b
        ret
L(w_mask_\type\()_tbl):
        .hword L(w_mask_\type\()_tbl) - 1280b
        .hword L(w_mask_\type\()_tbl) -  640b
        .hword L(w_mask_\type\()_tbl) -  320b
        .hword L(w_mask_\type\()_tbl) -  160b
        .hword L(w_mask_\type\()_tbl) -    8b
        .hword L(w_mask_\type\()_tbl) -    4b
endfunc
.endm

w_mask_fn 444
w_mask_fn 422
w_mask_fn 420


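// blend: dst = dst + (((tmp - dst)*m + 32) >> 6) with a per-pixel 6 bit mask from x5,
// i.e. (dst*(64 - m) + tmp*m + 32) >> 6. The multiply, rounding and shift are fused
// into one sqrdmulh against (-m) << 9, since ((a - b)*(-m << 9)*2 + (1 << 15)) >> 16
// equals ((b - a)*m + 32) >> 6.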
function blend_16bpc_neon, export=1
        adr             x6,  L(blend_tbl)
        clz             w3,  w3
        sub             w3,  w3,  #26
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw
        add             x8,  x0,  x1
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
4:
        ld1             {v2.8b},   [x5], #8
        ld1             {v1.8h},   [x2], #16
        ld1             {v0.d}[0], [x0]
        neg             v2.8b,   v2.8b            // -m
        subs            w4,  w4,  #2
        ld1             {v0.d}[1], [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
8:
        ld1             {v4.16b},       [x5], #16
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v5.16b,  v4.16b           // -m
        ld1             {v0.8h},   [x0]
        ld1             {v1.8h},   [x8]
        sxtl            v4.8h,   v5.8b
        sxtl2           v5.8h,   v5.16b
        shl             v4.8h,   v4.8h,   #9      // -m << 9
        shl             v5.8h,   v5.8h,   #9
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        subs            w4,  w4,  #2
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v5.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        lsl             x1,  x1,  #1
16:
        ld1             {v16.16b, v17.16b},           [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #2
        neg             v18.16b, v16.16b          // -m
        neg             v19.16b, v17.16b
        ld1             {v0.8h, v1.8h}, [x0]
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        ld1             {v2.8h, v3.8h}, [x8]
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        shl             v18.8h,  v18.8h,  #9
        shl             v19.8h,  v19.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v17.8h
        sqrdmulh        v6.8h,   v6.8h,   v18.8h
        sqrdmulh        v7.8h,   v7.8h,   v19.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b, v17.16b},           [x5], #32
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        subs            w4,  w4,  #1
        neg             v18.16b, v16.16b          // -m
        neg             v19.16b, v17.16b
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        shl             v18.8h,  v18.8h,  #9
        shl             v19.8h,  v19.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v17.8h
        sqrdmulh        v6.8h,   v6.8h,   v18.8h
        sqrdmulh        v7.8h,   v7.8h,   v19.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        b.gt            32b
        ret
L(blend_tbl):
        .hword L(blend_tbl) -  32b
        .hword L(blend_tbl) - 160b
        .hword L(blend_tbl) -  80b
        .hword L(blend_tbl) -  40b
endfunc

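// blend_h: horizontal-edge OBMC blend. One mask coefficient per row, taken from
// obmc_masks + h, and only the top h - h/4 rows are blended. The per-row arithmetic
// is the same sqrdmulh-based blend as in blend_16bpc_neon above.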
function blend_h_16bpc_neon, export=1
        adr             x6,  L(blend_h_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w4,  uxtw
        sub             w4,  w4,  w4,  lsr #2
        clz             w7,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w7,  w7,  #24
        ldrh            w7,  [x6,  x7,  lsl #1]
        sub             x6,  x6,  w7, uxtw
        br              x6
2:
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.4h},        [x2], #8
        ext             v2.8b,   v2.8b,   v3.8b,   #6
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
        ld1             {v0.s}[0], [x0]
        ld1             {v0.s}[1], [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
        sub             v1.4h,   v0.4h,   v1.4h   // a - b
        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
        add             v0.4h,   v0.4h,   v1.4h
        st1             {v0.s}[0], [x0], x1
        st1             {v0.s}[1], [x8], x1
        b.gt            2b
        ret
4:
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v2.8b, v3.8b}, [x5], #2
        ld1             {v1.8h},        [x2], #16
        ext             v2.8b,   v2.8b,   v3.8b,   #4
        subs            w4,  w4,  #2
        neg             v2.8b,   v2.8b            // -m
        ld1             {v0.d}[0],   [x0]
        ld1             {v0.d}[1],   [x8]
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        st1             {v0.d}[0], [x0], x1
        st1             {v0.d}[1], [x8], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v4.8b, v5.8b}, [x5], #2
        ld1             {v2.8h, v3.8h}, [x2], #32
        neg             v4.8b,   v4.8b            // -m
        neg             v5.8b,   v5.8b
        ld1             {v0.8h}, [x0]
        subs            w4,  w4,  #2
        sxtl            v4.8h,   v4.8b
        sxtl            v5.8h,   v5.8b
        ld1             {v1.8h}, [x8]
        shl             v4.8h,   v4.8h,   #9      // -m << 9
        shl             v5.8h,   v5.8h,   #9
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v5.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld2r            {v16.8b, v17.8b}, [x5], #2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        neg             v16.8b,  v16.8b           // -m
        neg             v17.8b,  v17.8b
        ld1             {v0.8h, v1.8h},  [x0]
        ld1             {v2.8h, v3.8h},  [x8]
        subs            w4,  w4,  #2
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.8h,  v17.8h,  #9
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.8h,   v1.8h,   v5.8h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.8h,   v3.8h,   v7.8h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.8h,   v5.8h,   v16.8h
        sqrdmulh        v6.8h,   v6.8h,   v17.8h
        sqrdmulh        v7.8h,   v7.8h,   v17.8h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.8h,   v1.8h,   v5.8h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.8h,   v3.8h,   v7.8h
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v2.8h, v3.8h}, [x8], x1
        b.gt            16b
        ret
1280:
640:
320:
        AARCH64_VALID_JUMP_TARGET
        sub             x1,  x1,  w3,  uxtw #1
        add             x7,  x2,  w3,  uxtw #1
321:
        ld2r            {v24.8b, v25.8b}, [x5], #2
        mov             w6,  w3
        neg             v24.8b,  v24.8b           // -m
        neg             v25.8b,  v25.8b
        sxtl            v24.8h,  v24.8b
        sxtl            v25.8h,  v25.8b
        shl             v24.8h,  v24.8h,  #9      // -m << 9
        shl             v25.8h,  v25.8h,  #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
        subs            w6,  w6,  #32
        sub             v16.8h,  v0.8h,   v16.8h  // a - b
        sub             v17.8h,  v1.8h,   v17.8h
        sub             v18.8h,  v2.8h,   v18.8h
        sub             v19.8h,  v3.8h,   v19.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h,  v17.8h,  v24.8h
        sqrdmulh        v18.8h,  v18.8h,  v24.8h
        sqrdmulh        v19.8h,  v19.8h,  v24.8h
        sub             v20.8h,  v4.8h,   v20.8h  // a - b
        sub             v21.8h,  v5.8h,   v21.8h
        sub             v22.8h,  v6.8h,   v22.8h
        sub             v23.8h,  v7.8h,   v23.8h
        add             v0.8h,   v0.8h,   v16.8h
        add             v1.8h,   v1.8h,   v17.8h
        add             v2.8h,   v2.8h,   v18.8h
        add             v3.8h,   v3.8h,   v19.8h
        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v21.8h,  v21.8h,  v25.8h
        sqrdmulh        v22.8h,  v22.8h,  v25.8h
        sqrdmulh        v23.8h,  v23.8h,  v25.8h
        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
        add             v4.8h,   v4.8h,   v20.8h
        add             v5.8h,   v5.8h,   v21.8h
        add             v6.8h,   v6.8h,   v22.8h
        add             v7.8h,   v7.8h,   v23.8h
        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
        b.gt            32b
        subs            w4,  w4,  #2
        add             x0,  x0,  x1
        add             x8,  x8,  x1
        add             x2,  x2,  w3,  uxtw #1
        add             x7,  x7,  w3,  uxtw #1
        b.gt            321b
        ret
L(blend_h_tbl):
        .hword L(blend_h_tbl) - 1280b
        .hword L(blend_h_tbl) -  640b
        .hword L(blend_h_tbl) -  320b
        .hword L(blend_h_tbl) -   16b
        .hword L(blend_h_tbl) -    8b
        .hword L(blend_h_tbl) -    4b
        .hword L(blend_h_tbl) -    2b
endfunc

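// blend_v: vertical-edge OBMC blend. One mask coefficient per column, taken from
// obmc_masks + w, and only the leftmost 3/4 of the width is written back, which is
// why the stores below only cover part of each row. Same sqrdmulh-based blend as above.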
function blend_v_16bpc_neon, export=1
        adr             x6,  L(blend_v_tbl)
        movrel          x5,  X(obmc_masks)
        add             x5,  x5,  w3,  uxtw
        clz             w3,  w3
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        sub             w3,  w3,  #26
        ldrh            w3,  [x6,  x3,  lsl #1]
        sub             x6,  x6,  w3,  uxtw
        br              x6
20:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.8b}, [x5]
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.4h,   v2.4h,   #9      // -m << 9
2:
        ld1             {v1.s}[0], [x2], #4
        ld1             {v0.h}[0], [x0]
        subs            w4,  w4,  #2
        ld1             {v1.h}[1], [x2]
        ld1             {v0.h}[1], [x8]
        add             x2,  x2,  #4
        sub             v1.4h,   v0.4h,   v1.4h   // a - b
        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
        add             v0.4h,   v0.4h,   v1.4h
        st1             {v0.h}[0], [x0],  x1
        st1             {v0.h}[1], [x8],  x1
        b.gt            2b
        ret
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v2.2s}, [x5]
        sub             x1,  x1,  #4
        neg             v2.8b,   v2.8b            // -m
        sxtl            v2.8h,   v2.8b
        shl             v2.8h,   v2.8h,   #9      // -m << 9
4:
        ld1             {v1.8h},   [x2], #16
        ld1             {v0.d}[0], [x0]
        ld1             {v0.d}[1], [x8]
        subs            w4,  w4,  #2
        sub             v1.8h,   v0.8h,   v1.8h   // a - b
        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
        add             v0.8h,   v0.8h,   v1.8h
        st1             {v0.s}[0], [x0], #4
        st1             {v0.s}[2], [x8], #4
        st1             {v0.h}[2], [x0], x1
        st1             {v0.h}[6], [x8], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8b}, [x5]
        sub             x1,  x1,  #8
        neg             v4.8b,   v4.8b            // -m
        sxtl            v4.8h,   v4.8b
        shl             v4.8h,   v4.8h,   #9      // -m << 9
8:
        ld1             {v2.8h, v3.8h}, [x2], #32
        ld1             {v0.8h}, [x0]
        ld1             {v1.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v2.8h,   v0.8h,   v2.8h   // a - b
        sub             v3.8h,   v1.8h,   v3.8h
        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
        sqrdmulh        v3.8h,   v3.8h,   v4.8h
        add             v0.8h,   v0.8h,   v2.8h
        add             v1.8h,   v1.8h,   v3.8h
        st1             {v0.d}[0], [x0], #8
        st1             {v1.d}[0], [x8], #8
        st1             {v0.s}[2], [x0], x1
        st1             {v1.s}[2], [x8], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b}, [x5]
        sub             x1,  x1,  #16
        neg             v17.16b, v16.16b          // -m
        sxtl            v16.8h,  v17.8b
        sxtl2           v17.8h,  v17.16b
        shl             v16.8h,  v16.8h,  #9      // -m << 9
        shl             v17.4h,  v17.4h,  #9
16:
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w4,  w4,  #2
        ld1             {v2.8h, v3.8h}, [x8]
        sub             v4.8h,   v0.8h,   v4.8h   // a - b
        sub             v5.4h,   v1.4h,   v5.4h
        sub             v6.8h,   v2.8h,   v6.8h
        sub             v7.4h,   v3.4h,   v7.4h
        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v5.4h,   v5.4h,   v17.4h
        sqrdmulh        v6.8h,   v6.8h,   v16.8h
        sqrdmulh        v7.4h,   v7.4h,   v17.4h
        add             v0.8h,   v0.8h,   v4.8h
        add             v1.4h,   v1.4h,   v5.4h
        add             v2.8h,   v2.8h,   v6.8h
        add             v3.4h,   v3.4h,   v7.4h
        st1             {v0.8h}, [x0], #16
        st1             {v2.8h}, [x8], #16
        st1             {v1.4h}, [x0], x1
        st1             {v3.4h}, [x8], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v24.16b, v25.16b},  [x5]
        neg             v26.16b, v24.16b          // -m
        neg             v27.8b,  v25.8b
        sxtl            v24.8h,  v26.8b
        sxtl2           v25.8h,  v26.16b
        sxtl            v26.8h,  v27.8b
        shl             v24.8h,  v24.8h,  #9      // -m << 9
        shl             v25.8h,  v25.8h,  #9
        shl             v26.8h,  v26.8h,  #9
32:
        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
        subs            w4,  w4,  #2
        sub             v16.8h,  v0.8h,   v16.8h  // a - b
        sub             v17.8h,  v1.8h,   v17.8h
        sub             v18.8h,  v2.8h,   v18.8h
        sub             v20.8h,  v4.8h,   v20.8h
        sub             v21.8h,  v5.8h,   v21.8h
        sub             v22.8h,  v6.8h,   v22.8h
        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
        sqrdmulh        v17.8h,  v17.8h,  v25.8h
        sqrdmulh        v18.8h,  v18.8h,  v26.8h
        sqrdmulh        v20.8h,  v20.8h,  v24.8h
        sqrdmulh        v21.8h,  v21.8h,  v25.8h
        sqrdmulh        v22.8h,  v22.8h,  v26.8h
        add             v0.8h,   v0.8h,   v16.8h
        add             v1.8h,   v1.8h,   v17.8h
        add             v2.8h,   v2.8h,   v18.8h
        add             v4.8h,   v4.8h,   v20.8h
        add             v5.8h,   v5.8h,   v21.8h
        add             v6.8h,   v6.8h,   v22.8h
        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
        b.gt            32b
        ret
L(blend_v_tbl):
        .hword L(blend_v_tbl) - 320b
        .hword L(blend_v_tbl) - 160b
        .hword L(blend_v_tbl) -  80b
        .hword L(blend_v_tbl) -  40b
        .hword L(blend_v_tbl) -  20b
endfunc


// This has got the same signature as the put_8tap functions,
// and assumes that x9 is set to (clz(w)-24).
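// For 16 bpc this is just a width-dispatched copy of the source rows to the
// destination (no filtering and no bit depth conversion), so the larger widths
// can use plain ldp/stp pairs.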
function put_neon
        adr             x10, L(put_tbl)
        ldrh            w9, [x10, x9, lsl #1]
        sub             x10, x10, w9, uxtw
        br              x10

2:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.s}[0], [x0], x1
        st1             {v1.s}[0], [x0], x1
        b.gt            2b
        ret
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h}, [x2], x3
        ld1             {v1.4h}, [x2], x3
        subs            w5,  w5,  #2
        st1             {v0.4h}, [x0], x1
        st1             {v1.4h}, [x0], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x0,  x1
        lsl             x1,  x1,  #1
        add             x9,  x2,  x3
        lsl             x3,  x3,  #1
8:
        ld1             {v0.8h}, [x2], x3
        ld1             {v1.8h}, [x9], x3
        subs            w5,  w5,  #2
        st1             {v0.8h}, [x0], x1
        st1             {v1.8h}, [x8], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        subs            w5,  w5,  #1
        stp             x8,  x9,  [x0, #16]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ldp             x6,  x7,  [x2]
        ldp             x8,  x9,  [x2, #16]
        stp             x6,  x7,  [x0]
        ldp             x10, x11, [x2, #32]
        stp             x8,  x9,  [x0, #16]
        subs            w5,  w5,  #1
        ldp             x12, x13, [x2, #48]
        stp             x10, x11, [x0, #32]
        stp             x12, x13, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        stp             q6,  q7,  [x0, #96]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            64b
        ret
128:
        AARCH64_VALID_JUMP_TARGET
        ldp             q0,  q1,  [x2]
        ldp             q2,  q3,  [x2, #32]
        stp             q0,  q1,  [x0]
        ldp             q4,  q5,  [x2, #64]
        stp             q2,  q3,  [x0, #32]
        ldp             q6,  q7,  [x2, #96]
        subs            w5,  w5,  #1
        stp             q4,  q5,  [x0, #64]
        ldp             q16, q17, [x2, #128]
        stp             q6,  q7,  [x0, #96]
        ldp             q18, q19, [x2, #160]
        stp             q16, q17, [x0, #128]
        ldp             q20, q21, [x2, #192]
        stp             q18, q19, [x0, #160]
        ldp             q22, q23, [x2, #224]
        stp             q20, q21, [x0, #192]
        stp             q22, q23, [x0, #224]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.gt            128b
        ret

L(put_tbl):
        .hword L(put_tbl) - 128b
        .hword L(put_tbl) -  64b
        .hword L(put_tbl) -  32b
        .hword L(put_tbl) -  16b
        .hword L(put_tbl) -  80b
        .hword L(put_tbl) -   4b
        .hword L(put_tbl) -   2b
endfunc


// This has got the same signature as the prep_8tap functions,
// and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
// x8 to w*2.
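// The output is the intermediate format used by the bi-directional functions above:
// each pixel is shifted left by intermediate_bits and has PREP_BIAS (8192)
// subtracted so it fits comfortably in a signed 16 bit lane.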
1123function prep_neon
1124        adr             x10, L(prep_tbl)
1125        ldrh            w9, [x10, x9, lsl #1]
1126        dup             v31.8h,  w7   // intermediate_bits
1127        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
1128        sub             x10, x10, w9, uxtw
1129        br              x10
1130
113140:
1132        AARCH64_VALID_JUMP_TARGET
1133        add             x9,  x1,  x2
1134        lsl             x2,  x2,  #1
11354:
1136        ld1             {v0.d}[0], [x1], x2
1137        ld1             {v0.d}[1], [x9], x2
1138        subs            w4,  w4,  #2
1139        sshl            v0.8h,   v0.8h,   v31.8h
1140        sub             v0.8h,   v0.8h,   v30.8h
1141        st1             {v0.8h}, [x0], #16
1142        b.gt            4b
1143        ret
114480:
1145        AARCH64_VALID_JUMP_TARGET
1146        add             x9,  x1,  x2
1147        lsl             x2,  x2,  #1
11488:
1149        ld1             {v0.8h}, [x1], x2
1150        ld1             {v1.8h}, [x9], x2
1151        subs            w4,  w4,  #2
1152        sshl            v0.8h,   v0.8h,   v31.8h
1153        sshl            v1.8h,   v1.8h,   v31.8h
1154        sub             v0.8h,   v0.8h,   v30.8h
1155        sub             v1.8h,   v1.8h,   v30.8h
1156        st1             {v0.8h, v1.8h}, [x0], #32
1157        b.gt            8b
1158        ret
115916:
1160        AARCH64_VALID_JUMP_TARGET
1161        ldp             q0,  q1,  [x1]
1162        add             x1,  x1,  x2
1163        sshl            v0.8h,   v0.8h,   v31.8h
1164        ldp             q2,  q3,  [x1]
1165        add             x1,  x1,  x2
1166        subs            w4,  w4,  #2
1167        sshl            v1.8h,   v1.8h,   v31.8h
1168        sshl            v2.8h,   v2.8h,   v31.8h
1169        sshl            v3.8h,   v3.8h,   v31.8h
1170        sub             v0.8h,   v0.8h,   v30.8h
1171        sub             v1.8h,   v1.8h,   v30.8h
1172        sub             v2.8h,   v2.8h,   v30.8h
1173        sub             v3.8h,   v3.8h,   v30.8h
1174        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1175        b.gt            16b
1176        ret
117732:
1178        AARCH64_VALID_JUMP_TARGET
1179        ldp             q0,  q1,  [x1]
1180        sshl            v0.8h,   v0.8h,   v31.8h
1181        ldp             q2,  q3,  [x1, #32]
1182        add             x1,  x1,  x2
1183        sshl            v1.8h,   v1.8h,   v31.8h
1184        sshl            v2.8h,   v2.8h,   v31.8h
1185        sshl            v3.8h,   v3.8h,   v31.8h
1186        subs            w4,  w4,  #1
1187        sub             v0.8h,   v0.8h,   v30.8h
1188        sub             v1.8h,   v1.8h,   v30.8h
1189        sub             v2.8h,   v2.8h,   v30.8h
1190        sub             v3.8h,   v3.8h,   v30.8h
1191        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
1192        b.gt            32b
1193        ret
119464:
1195        AARCH64_VALID_JUMP_TARGET
1196        ldp             q0,  q1,  [x1]
1197        subs            w4,  w4,  #1
1198        sshl            v0.8h,   v0.8h,   v31.8h
1199        ldp             q2,  q3,  [x1, #32]
1200        sshl            v1.8h,   v1.8h,   v31.8h
1201        ldp             q4,  q5,  [x1, #64]
1202        sshl            v2.8h,   v2.8h,   v31.8h
1203        sshl            v3.8h,   v3.8h,   v31.8h
1204        ldp             q6,  q7,  [x1, #96]
1205        add             x1,  x1,  x2
1206        sshl            v4.8h,   v4.8h,   v31.8h
1207        sshl            v5.8h,   v5.8h,   v31.8h
1208        sshl            v6.8h,   v6.8h,   v31.8h
1209        sshl            v7.8h,   v7.8h,   v31.8h
1210        sub             v0.8h,   v0.8h,   v30.8h
1211        sub             v1.8h,   v1.8h,   v30.8h
1212        sub             v2.8h,   v2.8h,   v30.8h
1213        sub             v3.8h,   v3.8h,   v30.8h
1214        stp             q0,  q1,  [x0]
1215        sub             v4.8h,   v4.8h,   v30.8h
1216        sub             v5.8h,   v5.8h,   v30.8h
1217        stp             q2,  q3,  [x0, #32]
1218        sub             v6.8h,   v6.8h,   v30.8h
1219        sub             v7.8h,   v7.8h,   v30.8h
1220        stp             q4,  q5,  [x0, #64]
1221        stp             q6,  q7,  [x0, #96]
1222        add             x0,  x0,  x8
1223        b.gt            64b
1224        ret
1225128:
1226        AARCH64_VALID_JUMP_TARGET
1227        ldp             q0,  q1,  [x1]
1228        subs            w4,  w4,  #1
1229        sshl            v0.8h,   v0.8h,   v31.8h
1230        ldp             q2,  q3,  [x1, #32]
1231        sshl            v1.8h,   v1.8h,   v31.8h
1232        ldp             q4,  q5,  [x1, #64]
1233        sshl            v2.8h,   v2.8h,   v31.8h
1234        sshl            v3.8h,   v3.8h,   v31.8h
1235        ldp             q6,  q7,  [x1, #96]
1236        sshl            v4.8h,   v4.8h,   v31.8h
1237        sshl            v5.8h,   v5.8h,   v31.8h
1238        ldp             q16, q17, [x1, #128]
1239        sshl            v6.8h,   v6.8h,   v31.8h
1240        sshl            v7.8h,   v7.8h,   v31.8h
1241        ldp             q18, q19, [x1, #160]
1242        sshl            v16.8h,  v16.8h,  v31.8h
1243        sshl            v17.8h,  v17.8h,  v31.8h
1244        ldp             q20, q21, [x1, #192]
1245        sshl            v18.8h,  v18.8h,  v31.8h
1246        sshl            v19.8h,  v19.8h,  v31.8h
1247        ldp             q22, q23, [x1, #224]
1248        add             x1,  x1,  x2
1249        sshl            v20.8h,  v20.8h,  v31.8h
1250        sshl            v21.8h,  v21.8h,  v31.8h
1251        sshl            v22.8h,  v22.8h,  v31.8h
1252        sshl            v23.8h,  v23.8h,  v31.8h
1253        sub             v0.8h,   v0.8h,   v30.8h
1254        sub             v1.8h,   v1.8h,   v30.8h
1255        sub             v2.8h,   v2.8h,   v30.8h
1256        sub             v3.8h,   v3.8h,   v30.8h
1257        stp             q0,  q1,  [x0]
1258        sub             v4.8h,   v4.8h,   v30.8h
1259        sub             v5.8h,   v5.8h,   v30.8h
1260        stp             q2,  q3,  [x0, #32]
1261        sub             v6.8h,   v6.8h,   v30.8h
1262        sub             v7.8h,   v7.8h,   v30.8h
1263        stp             q4,  q5,  [x0, #64]
1264        sub             v16.8h,  v16.8h,  v30.8h
1265        sub             v17.8h,  v17.8h,  v30.8h
1266        stp             q6,  q7,  [x0, #96]
1267        sub             v18.8h,  v18.8h,  v30.8h
1268        sub             v19.8h,  v19.8h,  v30.8h
1269        stp             q16, q17, [x0, #128]
1270        sub             v20.8h,  v20.8h,  v30.8h
1271        sub             v21.8h,  v21.8h,  v30.8h
1272        stp             q18, q19, [x0, #160]
1273        sub             v22.8h,  v22.8h,  v30.8h
1274        sub             v23.8h,  v23.8h,  v30.8h
1275        stp             q20, q21, [x0, #192]
1276        stp             q22, q23, [x0, #224]
1277        add             x0,  x0,  x8
1278        b.gt            128b
1279        ret
1280
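        // Jump table for the width-specific prep blocks above: each .hword
        // holds the distance from the table back to the matching label, and
        // the function prologue looks up the entry for the block width and
        // subtracts it from the table address before branching (cf. the
        // *_h_tbl/*_v_tbl dispatch further down). Judging by the sshl/sub
        // pairs above, v31 carries the intermediate_bits left-shift and v30
        // the PREP_BIAS constant.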
1281L(prep_tbl):
1282        .hword L(prep_tbl) - 128b
1283        .hword L(prep_tbl) -  64b
1284        .hword L(prep_tbl) -  32b
1285        .hword L(prep_tbl) -  16b
1286        .hword L(prep_tbl) -  80b
1287        .hword L(prep_tbl) -  40b
1288endfunc
1289
1290
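        // Load helper macros for the vertical/hv filters below: load_slice
        // loads single .s lanes, load_reg whole D/Q registers and
        // load_regpair register pairs, alternating between the two source
        // pointers \s0 and \s1 so two rows advance per call. The trailing
        // arguments are optional (.ifnb), so one macro covers several row
        // counts.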
1291.macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1292        ld1             {\d0\wd}[0], [\s0], \strd
1293        ld1             {\d1\wd}[0], [\s1], \strd
1294.ifnb \d2
1295        ld1             {\d2\wd}[0], [\s0], \strd
1296        ld1             {\d3\wd}[0], [\s1], \strd
1297.endif
1298.ifnb \d4
1299        ld1             {\d4\wd}[0], [\s0], \strd
1300.endif
1301.ifnb \d5
1302        ld1             {\d5\wd}[0], [\s1], \strd
1303.endif
1304.ifnb \d6
1305        ld1             {\d6\wd}[0], [\s0], \strd
1306.endif
1307.endm
1308.macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
1309        ld1             {\d0\wd}, [\s0], \strd
1310        ld1             {\d1\wd}, [\s1], \strd
1311.ifnb \d2
1312        ld1             {\d2\wd}, [\s0], \strd
1313        ld1             {\d3\wd}, [\s1], \strd
1314.endif
1315.ifnb \d4
1316        ld1             {\d4\wd}, [\s0], \strd
1317.endif
1318.ifnb \d5
1319        ld1             {\d5\wd}, [\s1], \strd
1320.endif
1321.ifnb \d6
1322        ld1             {\d6\wd}, [\s0], \strd
1323.endif
1324.endm
1325.macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
1326        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
1327.ifnb \d2
1328        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
1329.endif
1330.ifnb \d4
1331        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
1332.endif
1333.endm
1334.macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1335        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1336.endm
1337.macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1338        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1339.endm
1340.macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
1341        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
1342.endm
1343.macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
1344        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
1345.endm
1346.macro interleave_1 wd, r0, r1, r2, r3, r4
1347        trn1            \r0\wd, \r0\wd, \r1\wd
1348        trn1            \r1\wd, \r1\wd, \r2\wd
1349.ifnb \r3
1350        trn1            \r2\wd, \r2\wd, \r3\wd
1351        trn1            \r3\wd, \r3\wd, \r4\wd
1352.endif
1353.endm
1354.macro interleave_1_s r0, r1, r2, r3, r4
1355        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
1356.endm
1357.macro umin_h c, wd, r0, r1, r2, r3
1358        umin            \r0\wd,  \r0\wd,  \c\wd
1359.ifnb \r1
1360        umin            \r1\wd,  \r1\wd,  \c\wd
1361.endif
1362.ifnb \r2
1363        umin            \r2\wd,  \r2\wd,  \c\wd
1364        umin            \r3\wd,  \r3\wd,  \c\wd
1365.endif
1366.endm
1367.macro sub_h c, wd, r0, r1, r2, r3
1368        sub             \r0\wd,  \r0\wd,  \c\wd
1369.ifnb \r1
1370        sub             \r1\wd,  \r1\wd,  \c\wd
1371.endif
1372.ifnb \r2
1373        sub             \r2\wd,  \r2\wd,  \c\wd
1374        sub             \r3\wd,  \r3\wd,  \c\wd
1375.endif
1376.endm
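        // Multiply-accumulate macros for the 4-, 6- and 8-tap filters: each
        // widens the 16-bit rows into a 32-bit accumulator, multiplying by
        // the coefficients kept in v0.h[0-7]. The 6-tap variants take the
        // same eight source arguments but skip the first and last tap
        // (which are zero for 6-tap filters).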
1377.macro smull_smlal_4tap d, s0, s1, s2, s3
1378        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1379        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1380        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1381        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1382.endm
1383.macro smull2_smlal2_4tap d, s0, s1, s2, s3
1384        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1385        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1386        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1387        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1388.endm
1389.macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
1390        smull           \d\().4s,  \s1\().4h,  v0.h[1]
1391        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1392        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1393        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1394        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1395        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1396.endm
1397.macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
1398        smull2          \d\().4s,  \s1\().8h,  v0.h[1]
1399        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1400        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1401        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1402        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1403        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1404.endm
1405.macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
1406        smull           \d\().4s,  \s0\().4h,  v0.h[0]
1407        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
1408        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
1409        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
1410        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
1411        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
1412        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
1413        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
1414.endm
1415.macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
1416        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
1417        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
1418        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
1419        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
1420        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
1421        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
1422        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
1423        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
1424.endm
1425.macro sqrshrun_h shift, r0, r1, r2, r3
1426        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
1427.ifnb \r1
1428        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
1429.endif
1430.ifnb \r2
1431        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
1432        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
1433.endif
1434.endm
1435.macro xtn_h r0, r1, r2, r3
1436        uzp1            \r0\().8h,  \r0\().8h,  \r1\().8h // Same as xtn, xtn2
1437.ifnb \r2
1438        uzp1            \r2\().8h,  \r2\().8h,  \r3\().8h // Ditto
1439.endif
1440.endm
1441.macro srshl_s shift, r0, r1, r2, r3
1442        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
1443        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
1444.ifnb \r2
1445        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
1446        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
1447.endif
1448.endm
1449.macro st_s strd, reg, lanes
1450        st1             {\reg\().s}[0], [x0], \strd
1451        st1             {\reg\().s}[1], [x9], \strd
1452.if \lanes > 2
1453        st1             {\reg\().s}[2], [x0], \strd
1454        st1             {\reg\().s}[3], [x9], \strd
1455.endif
1456.endm
1457.macro st_d strd, r0, r1
1458        st1             {\r0\().d}[0], [x0], \strd
1459        st1             {\r0\().d}[1], [x9], \strd
1460.ifnb \r1
1461        st1             {\r1\().d}[0], [x0], \strd
1462        st1             {\r1\().d}[1], [x9], \strd
1463.endif
1464.endm
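        // Final shift/store helpers: for put, round the 32-bit sums down by
        // 6, saturate to unsigned 16 bit and clamp against bitdepth_max in
        // v31; for prep, shift down to intermediate precision (v30 holds
        // -(6-intermediate_bits)), narrow, and subtract the PREP_BIAS held
        // in v29.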
1465.macro shift_store_4 type, strd, r0, r1, r2, r3
1466.ifc \type, put
1467        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1468        umin_h          v31, .8h, \r0, \r2
1469.else
1470        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1471        xtn_h           \r0, \r1, \r2, \r3
1472        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1473.endif
1474        st_d            \strd, \r0, \r2
1475.endm
1476.macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
1477        st1             {\r0\wd}, [x0], \strd
1478        st1             {\r1\wd}, [x9], \strd
1479.ifnb \r2
1480        st1             {\r2\wd}, [x0], \strd
1481        st1             {\r3\wd}, [x9], \strd
1482.endif
1483.ifnb \r4
1484        st1             {\r4\wd}, [x0], \strd
1485        st1             {\r5\wd}, [x9], \strd
1486        st1             {\r6\wd}, [x0], \strd
1487        st1             {\r7\wd}, [x9], \strd
1488.endif
1489.endm
1490.macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
1491        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
1492.endm
1493.macro shift_store_8 type, strd, r0, r1, r2, r3
1494.ifc \type, put
1495        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1496        umin_h          v31, .8h, \r0, \r2
1497.else
1498        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1499        xtn_h           \r0, \r1, \r2, \r3
1500        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
1501.endif
1502        st_8h           \strd, \r0, \r2
1503.endm
1504.macro shift_store_16 type, strd, dst, r0, r1, r2, r3
1505.ifc \type, put
1506        sqrshrun_h      6,   \r0, \r1, \r2, \r3
1507        umin            \r0\().8h, \r0\().8h, v31.8h
1508        umin            \r1\().8h, \r2\().8h, v31.8h
1509.else
1510        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
1511        xtn_h           \r0, \r1, \r2, \r3
1512        sub             \r0\().8h, \r0\().8h, v29.8h
1513        sub             \r1\().8h, \r2\().8h, v29.8h
1514.endif
1515        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
1516.endm
1517
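        // Generates one exported put/prep entry point per filter
        // combination: the horizontal and vertical filter types are passed
        // on in w9/w10 and the shared \op\()_\taps\()_neon core is entered
        // with a tail branch.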
1518.macro make_8tap_fn op, type, type_h, type_v, taps
1519function \op\()_8tap_\type\()_16bpc_neon, export=1
1520        mov             w9,  \type_h
1521        mov             w10, \type_v
1522        b               \op\()_\taps\()_neon
1523endfunc
1524.endm
1525
1526// No spaces in these expressions, due to gas-preprocessor.
1527#define REGULAR ((0*15<<7)|3*15)
1528#define SMOOTH  ((1*15<<7)|4*15)
1529#define SHARP   ((2*15<<7)|3*15)
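        // Each constant packs two offsets into dav1d_mc_subpel_filters
        // (15 filters per set, 8 bytes each, matching the uxtw #3 scaling
        // below): bits 7-13 select the set used for larger blocks, bits 0-6
        // the set used when the filtered dimension is <= 4. filter_fn
        // multiplies mx/my by 0x4081 to copy the subpel position into bits
        // 0, 7 and 14 and then adds one of these constants; e.g. mx=5 with
        // REGULAR leaves 3*15+5 in bits 0-6 and 0*15+5 in bits 7-13, while
        // the bits above 13 stay nonzero only when there is a subpel offset
        // at all (what the tst #(0x7f << 14) checks).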
1530
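        // filter_fn expands to the shared put/prep core for one tap count.
        // \xmx and \xmy are the pointer-sized companions of \mx/\my and end
        // up holding the horizontal/vertical filter pointers; \ds2/\sr2 are
        // used as second-row dst/src pointers so two rows can be processed
        // per iteration.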
1531.macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
1532function \type\()_\taps\()_neon
1533.ifc \bdmax, w8
1534        ldr             w8,  [sp]
1535.endif
1536        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
1537        mul             \mx,  \mx, w11
1538        mul             \my,  \my, w11
1539        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
1540        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
1541.ifc \type, prep
1542        uxtw            \d_strd, \w
1543        lsl             \d_strd, \d_strd, #1
1544.endif
1545
1546        dup             v31.8h,  \bdmax        // bitdepth_max
1547        clz             \bdmax,  \bdmax
1548        clz             w9,  \w
1549        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
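        // e.g. bitdepth_max = 0x3ff (10 bpc) gives clz = 22 and
        // intermediate_bits = 4; 0xfff (12 bpc) gives 2.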
1550        mov             w12, #6
1551        tst             \mx, #(0x7f << 14)
1552        sub             w9,  w9,  #24
1553        add             w13, w12, \bdmax       // 6 + intermediate_bits
1554        sub             w12, w12, \bdmax       // 6 - intermediate_bits
1555        movrel          x11, X(mc_subpel_filters), -8
1556        b.ne            L(\type\()_\taps\()_h)
1557        tst             \my, #(0x7f << 14)
1558        b.ne            L(\type\()_\taps\()_v)
1559        b               \type\()_neon
1560
1561L(\type\()_\taps\()_h):
1562        cmp             \w,   #4
1563        ubfx            w10,  \mx, #7, #7
1564        and             \mx,  \mx, #0x7f
1565        b.le            4f
1566        mov             \mx,  w10
15674:
1568        tst             \my,  #(0x7f << 14)
1569        add             \xmx, x11, \mx, uxtw #3
1570        b.ne            L(\type\()_\taps\()_hv)
1571
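        // Dispatch on width: w9 = clz(w) - 24 from the prologue indexes the
        // table from the widest case (1280 <=> w=128) down to the narrowest
        // (20 <=> w=2); each entry is the distance from the table back to
        // its code block.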
1572        adr             x10, L(\type\()_\taps\()_h_tbl)
1573        dup             v30.4s,  w12           // 6 - intermediate_bits
1574        ldrh            w9,  [x10, x9, lsl #1]
1575        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1576.ifc \type, put
1577        dup             v29.8h,  \bdmax        // intermediate_bits
1578.else
1579        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
1580.endif
1581        sub             x10, x10, w9, uxtw
1582.ifc \type, put
1583        neg             v29.8h,  v29.8h        // -intermediate_bits
1584.endif
1585        br              x10
1586
158720:     // 2xN h
1588        AARCH64_VALID_JUMP_TARGET
1589.ifc \type, put
1590        add             \xmx,  \xmx,  #2
1591        ld1             {v0.s}[0], [\xmx]
1592        sub             \src,  \src,  #2
1593        add             \ds2,  \dst,  \d_strd
1594        add             \sr2,  \src,  \s_strd
1595        lsl             \d_strd,  \d_strd,  #1
1596        lsl             \s_strd,  \s_strd,  #1
1597        sxtl            v0.8h,   v0.8b
15982:
1599        ld1             {v4.8h},  [\src], \s_strd
1600        ld1             {v6.8h},  [\sr2], \s_strd
1601        ext             v5.16b,  v4.16b,  v4.16b,  #2
1602        ext             v7.16b,  v6.16b,  v6.16b,  #2
1603        subs            \h,  \h,  #2
1604        trn1            v3.2s,   v4.2s,   v6.2s
1605        trn2            v6.2s,   v4.2s,   v6.2s
1606        trn1            v4.2s,   v5.2s,   v7.2s
1607        trn2            v7.2s,   v5.2s,   v7.2s
1608        smull           v3.4s,   v3.4h,   v0.h[0]
1609        smlal           v3.4s,   v4.4h,   v0.h[1]
1610        smlal           v3.4s,   v6.4h,   v0.h[2]
1611        smlal           v3.4s,   v7.4h,   v0.h[3]
1612        srshl           v3.4s,   v3.4s,   v30.4s // -(6-intermediate_bits)
1613        sqxtun          v3.4h,   v3.4s
1614        srshl           v3.4h,   v3.4h,   v29.4h // -intermediate_bits
1615        umin            v3.4h,   v3.4h,   v31.4h
1616        st1             {v3.s}[0], [\dst], \d_strd
1617        st1             {v3.s}[1], [\ds2], \d_strd
1618        b.gt            2b
1619        ret
1620.endif
1621
162240:     // 4xN h
1623        AARCH64_VALID_JUMP_TARGET
1624        add             \xmx,  \xmx,  #2
1625        ld1             {v0.s}[0], [\xmx]
1626        sub             \src,  \src,  #2
1627        add             \ds2,  \dst,  \d_strd
1628        add             \sr2,  \src,  \s_strd
1629        lsl             \d_strd,  \d_strd,  #1
1630        lsl             \s_strd,  \s_strd,  #1
1631        sxtl            v0.8h,   v0.8b
16324:
1633        ld1             {v16.8h}, [\src], \s_strd
1634        ld1             {v20.8h}, [\sr2], \s_strd
1635        ext             v17.16b, v16.16b, v16.16b, #2
1636        ext             v18.16b, v16.16b, v16.16b, #4
1637        ext             v19.16b, v16.16b, v16.16b, #6
1638        ext             v21.16b, v20.16b, v20.16b, #2
1639        ext             v22.16b, v20.16b, v20.16b, #4
1640        ext             v23.16b, v20.16b, v20.16b, #6
1641        subs            \h,  \h,  #2
1642        smull           v16.4s,  v16.4h,  v0.h[0]
1643        smlal           v16.4s,  v17.4h,  v0.h[1]
1644        smlal           v16.4s,  v18.4h,  v0.h[2]
1645        smlal           v16.4s,  v19.4h,  v0.h[3]
1646        smull           v20.4s,  v20.4h,  v0.h[0]
1647        smlal           v20.4s,  v21.4h,  v0.h[1]
1648        smlal           v20.4s,  v22.4h,  v0.h[2]
1649        smlal           v20.4s,  v23.4h,  v0.h[3]
1650        srshl           v16.4s,  v16.4s,  v30.4s // -(6-intermediate_bits)
1651        srshl           v20.4s,  v20.4s,  v30.4s // -(6-intermediate_bits)
1652.ifc \type, put
1653        sqxtun          v16.4h,  v16.4s
1654        sqxtun2         v16.8h,  v20.4s
1655        srshl           v16.8h,  v16.8h,  v29.8h // -intermediate_bits
1656        umin            v16.8h,  v16.8h,  v31.8h
1657.else
1658        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
1659        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
1660.endif
1661        st1             {v16.d}[0], [\dst], \d_strd
1662        st1             {v16.d}[1], [\ds2], \d_strd
1663        b.gt            4b
1664        ret
1665
166680:
1667160:
1668320:
1669640:
16701280:   // 8xN, 16xN, 32xN, ... h
1671        AARCH64_VALID_JUMP_TARGET
1672        ld1             {v0.8b}, [\xmx]
1673        sub             \src,  \src,  #6
1674        add             \ds2,  \dst,  \d_strd
1675        add             \sr2,  \src,  \s_strd
1676        lsl             \s_strd,  \s_strd,  #1
1677        sxtl            v0.8h,   v0.8b
1678
1679        sub             \s_strd,  \s_strd,  \w, uxtw #1
1680        sub             \s_strd,  \s_strd,  #16
1681.ifc \type, put
1682        lsl             \d_strd,  \d_strd,  #1
1683        sub             \d_strd,  \d_strd,  \w, uxtw #1
1684.endif
168581:
1686        ld1             {v16.8h, v17.8h},  [\src], #32
1687        ld1             {v20.8h, v21.8h},  [\sr2], #32
1688        mov             \mx, \w
1689
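        // Inner loop over the row in 8-pixel steps: the first used tap is
        // multiplied with smull, then .irpc expands the remaining taps,
        // each iteration using ext to advance the window over the 16 loaded
        // pixels by \i pixels (2*\i bytes).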
16908:
1691.ifc \taps, 6tap
1692        ext             v24.16b, v16.16b, v17.16b, #2
1693        ext             v25.16b, v20.16b, v21.16b, #2
1694        smull           v18.4s,  v24.4h,  v0.h[1]
1695        smull2          v19.4s,  v24.8h,  v0.h[1]
1696        smull           v22.4s,  v25.4h,  v0.h[1]
1697        smull2          v23.4s,  v25.8h,  v0.h[1]
1698.irpc i, 23456
1699        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
1700        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
1701        smlal           v18.4s,  v24.4h,  v0.h[\i]
1702        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1703        smlal           v22.4s,  v25.4h,  v0.h[\i]
1704        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1705.endr
1706.else   // 8tap
1707        smull           v18.4s,  v16.4h,  v0.h[0]
1708        smull2          v19.4s,  v16.8h,  v0.h[0]
1709        smull           v22.4s,  v20.4h,  v0.h[0]
1710        smull2          v23.4s,  v20.8h,  v0.h[0]
1711.irpc i, 1234567
1712        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
1713        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
1714        smlal           v18.4s,  v24.4h,  v0.h[\i]
1715        smlal2          v19.4s,  v24.8h,  v0.h[\i]
1716        smlal           v22.4s,  v25.4h,  v0.h[\i]
1717        smlal2          v23.4s,  v25.8h,  v0.h[\i]
1718.endr
1719.endif
1720        subs            \mx, \mx, #8
1721        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
1722        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
1723        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
1724        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
1725.ifc \type, put
1726        sqxtun          v18.4h,  v18.4s
1727        sqxtun2         v18.8h,  v19.4s
1728        sqxtun          v22.4h,  v22.4s
1729        sqxtun2         v22.8h,  v23.4s
1730        srshl           v18.8h,  v18.8h,  v29.8h // -intermediate_bits
1731        srshl           v22.8h,  v22.8h,  v29.8h // -intermediate_bits
1732        umin            v18.8h,  v18.8h,  v31.8h
1733        umin            v22.8h,  v22.8h,  v31.8h
1734.else
1735        uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
1736        uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
1737        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
1738        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
1739.endif
1740        st1             {v18.8h}, [\dst], #16
1741        st1             {v22.8h}, [\ds2], #16
1742        b.le            9f
1743
1744        mov             v16.16b, v17.16b
1745        mov             v20.16b, v21.16b
1746        ld1             {v17.8h}, [\src], #16
1747        ld1             {v21.8h}, [\sr2], #16
1748        b               8b
1749
17509:
1751        add             \dst,  \dst,  \d_strd
1752        add             \ds2,  \ds2,  \d_strd
1753        add             \src,  \src,  \s_strd
1754        add             \sr2,  \sr2,  \s_strd
1755
1756        subs            \h,  \h,  #2
1757        b.gt            81b
1758        ret
1759
1760L(\type\()_\taps\()_h_tbl):
1761        .hword L(\type\()_\taps\()_h_tbl) - 1280b
1762        .hword L(\type\()_\taps\()_h_tbl) -  640b
1763        .hword L(\type\()_\taps\()_h_tbl) -  320b
1764        .hword L(\type\()_\taps\()_h_tbl) -  160b
1765        .hword L(\type\()_\taps\()_h_tbl) -   80b
1766        .hword L(\type\()_\taps\()_h_tbl) -   40b
1767        .hword L(\type\()_\taps\()_h_tbl) -   20b
1768        .hword 0
1769
1770
1771L(\type\()_\taps\()_v):
1772        cmp             \h,  #4
1773        ubfx            w10, \my, #7, #7
1774        and             \my, \my, #0x7f
1775        b.le            4f
1776        mov             \my, w10
17774:
1778        add             \xmy, x11, \my, uxtw #3
1779
1780.ifc \type, prep
1781        dup             v30.4s,  w12           // 6 - intermediate_bits
1782        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
1783.endif
1784        adr             x10, L(\type\()_\taps\()_v_tbl)
1785        ldrh            w9,  [x10, x9, lsl #1]
1786.ifc \type, prep
1787        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
1788.endif
1789        sub             x10, x10, w9, uxtw
1790        br              x10
1791
179220:     // 2xN v
1793        AARCH64_VALID_JUMP_TARGET
1794.ifc \type, put
1795        b.gt            28f
1796
1797        cmp             \h,  #2
1798        add             \xmy, \xmy, #2
1799        ld1             {v0.s}[0], [\xmy]
1800        sub             \src,  \src,  \s_strd
1801        add             \ds2,  \dst,  \d_strd
1802        add             \sr2,  \src,  \s_strd
1803        lsl             \s_strd,  \s_strd,  #1
1804        lsl             \d_strd,  \d_strd,  #1
1805        sxtl            v0.8h,   v0.8b
1806
1807        // 2x2 v
1808        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1809        interleave_1_s  v1,  v2,  v3,  v4,  v5
1810        b.gt            24f
1811        smull_smlal_4tap v6, v1,  v2,  v3,  v4
1812        sqrshrun_h      6,   v6
1813        umin_h          v31, .8h, v6
1814        st_s            \d_strd, v6, 2
1815        ret
1816
181724:     // 2x4 v
1818        load_s          \sr2, \src, \s_strd, v6, v7
1819        interleave_1_s  v5,  v6,  v7
1820        smull_smlal_4tap v16, v1, v2, v3, v4
1821        smull_smlal_4tap v17, v3, v4, v5, v6
1822        sqrshrun_h      6,   v16, v17
1823        umin_h          v31, .8h, v16
1824        st_s            \d_strd, v16, 4
1825        ret
1826
182728:     // 2x6, 2x8, 2x12, 2x16 v
1828        ld1             {v0.8b}, [\xmy]
1829        sub             \sr2,  \src,  \s_strd, lsl #1
1830        add             \ds2,  \dst,  \d_strd
1831        sub             \src,  \sr2,  \s_strd
1832        lsl             \d_strd,  \d_strd,  #1
1833        lsl             \s_strd,  \s_strd,  #1
1834        sxtl            v0.8h,   v0.8b
1835
1836        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
1837        interleave_1_s  v1,  v2,  v3,  v4,  v5
1838        interleave_1_s  v5,  v6,  v7
1839216:
1840        subs            \h,  \h,  #4
1841        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
1842        interleave_1_s  v7,  v16, v17, v18, v19
1843        smull_smlal_\taps v24, v1,  v2,  v3,  v4,  v5,  v6,  v7, v16
1844        smull_smlal_\taps v25, v3,  v4,  v5,  v6,  v7, v16, v17, v18
1845        sqrshrun_h      6,   v24, v25
1846        umin_h          v31, .8h, v24
1847        st_s            \d_strd, v24, 4
1848        b.le            0f
1849        cmp             \h,  #2
1850        mov             v1.16b,  v5.16b
1851        mov             v2.16b,  v6.16b
1852        mov             v3.16b,  v7.16b
1853        mov             v4.16b,  v16.16b
1854        mov             v5.16b,  v17.16b
1855        mov             v6.16b,  v18.16b
1856        mov             v7.16b,  v19.16b
1857        b.eq            26f
1858        b               216b
185926:
1860        load_s          \sr2, \src, \s_strd, v16, v17
1861        interleave_1_s  v7,  v16, v17
1862        smull_smlal_\taps v24, v1, v2,  v3,  v4,  v5,  v6,  v7, v16
1863        sqrshrun_h      6,   v24
1864        umin_h          v31, .4h, v24
1865        st_s            \d_strd, v24, 2
18660:
1867        ret
1868.endif
1869
187040:
1871        AARCH64_VALID_JUMP_TARGET
1872        b.gt            480f
1873
1874        // 4x2, 4x4 v
1875        cmp             \h,  #2
1876        add             \xmy, \xmy, #2
1877        ld1             {v0.s}[0], [\xmy]
1878        sub             \src, \src, \s_strd
1879        add             \ds2, \dst, \d_strd
1880        add             \sr2, \src, \s_strd
1881        lsl             \s_strd, \s_strd, #1
1882        lsl             \d_strd, \d_strd, #1
1883        sxtl            v0.8h,   v0.8b
1884
1885        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1886        smull_smlal_4tap v6,  v1,  v2,  v3,  v4
1887        smull_smlal_4tap v7,  v2,  v3,  v4,  v5
1888        shift_store_4   \type, \d_strd, v6, v7
1889        b.le            0f
1890        load_4h         \sr2, \src, \s_strd, v6, v7
1891        smull_smlal_4tap v1,  v3,  v4,  v5,  v6
1892        smull_smlal_4tap v2,  v4,  v5,  v6,  v7
1893        shift_store_4   \type, \d_strd, v1, v2
18940:
1895        ret
1896
1897480:    // 4x6, 4x8, 4x12, 4x16 v
1898        ld1             {v0.8b}, [\xmy]
1899        sub             \sr2, \src, \s_strd, lsl #1
1900        add             \ds2, \dst, \d_strd
1901        sub             \src, \sr2, \s_strd
1902        lsl             \s_strd, \s_strd, #1
1903        lsl             \d_strd, \d_strd, #1
1904        sxtl            v0.8h,   v0.8b
1905
1906        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1907
190848:
1909        subs            \h,  \h,  #4
1910        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
1911        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
1912        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
1913        smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
1914        smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
1915        shift_store_4   \type, \d_strd, v1, v2, v3, v4
1916        b.le            0f
1917        cmp             \h,  #2
1918        mov             v16.8b,  v20.8b
1919        mov             v17.8b,  v21.8b
1920        mov             v18.8b,  v22.8b
1921        mov             v19.8b,  v23.8b
1922        mov             v20.8b,  v24.8b
1923        mov             v21.8b,  v25.8b
1924        mov             v22.8b,  v26.8b
1925        b.eq            46f
1926        b               48b
192746:
1928        load_4h         \sr2, \src, \s_strd, v23, v24
1929        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
1930        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
1931        shift_store_4   \type, \d_strd, v1, v2
19320:
1933        ret
1934
193580:
1936        AARCH64_VALID_JUMP_TARGET
1937        b.gt            880f
1938
1939        // 8x2, 8x4 v
1940        cmp             \h,  #2
1941        add             \xmy, \xmy, #2
1942        ld1             {v0.s}[0], [\xmy]
1943        sub             \src, \src, \s_strd
1944        add             \ds2, \dst, \d_strd
1945        add             \sr2, \src, \s_strd
1946        lsl             \s_strd, \s_strd, #1
1947        lsl             \d_strd, \d_strd, #1
1948        sxtl            v0.8h,   v0.8b
1949
1950        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
1951        smull_smlal_4tap   v16, v1,  v2,  v3,  v4
1952        smull2_smlal2_4tap v17, v1,  v2,  v3,  v4
1953        smull_smlal_4tap   v18, v2,  v3,  v4,  v5
1954        smull2_smlal2_4tap v19, v2,  v3,  v4,  v5
1955        shift_store_8   \type, \d_strd, v16, v17, v18, v19
1956        b.le            0f
1957        load_8h         \sr2, \src, \s_strd, v6, v7
1958        smull_smlal_4tap   v16, v3,  v4,  v5,  v6
1959        smull2_smlal2_4tap v17, v3,  v4,  v5,  v6
1960        smull_smlal_4tap   v18, v4,  v5,  v6,  v7
1961        smull2_smlal2_4tap v19, v4,  v5,  v6,  v7
1962        shift_store_8   \type, \d_strd, v16, v17, v18, v19
19630:
1964        ret
1965
1966880:    // 8x6, 8x8, 8x16, 8x32 v
19671680:   // 16x8, 16x16, ...
1968320:    // 32x8, 32x16, ...
1969640:
19701280:
1971        AARCH64_VALID_JUMP_TARGET
1972        ld1             {v0.8b}, [\xmy]
1973        sub             \src, \src, \s_strd
1974        sub             \src, \src, \s_strd, lsl #1
1975        sxtl            v0.8h,   v0.8b
1976        mov             \my,  \h
1977168:
1978        add             \ds2, \dst, \d_strd
1979        add             \sr2, \src, \s_strd
1980        lsl             \s_strd, \s_strd, #1
1981        lsl             \d_strd, \d_strd, #1
1982
1983        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
1984
198588:
1986        subs            \h,  \h,  #2
1987        load_8h         \sr2, \src, \s_strd, v23, v24
1988        smull_smlal_\taps   v1, v16, v17, v18, v19, v20, v21, v22, v23
1989        smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
1990        smull_smlal_\taps   v3, v17, v18, v19, v20, v21, v22, v23, v24
1991        smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
1992        shift_store_8   \type, \d_strd, v1, v2, v3, v4
1993        b.le            9f
1994        subs            \h,  \h,  #2
1995        load_8h         \sr2, \src, \s_strd, v25, v26
1996        smull_smlal_\taps   v1, v18, v19, v20, v21, v22, v23, v24, v25
1997        smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
1998        smull_smlal_\taps   v3, v19, v20, v21, v22, v23, v24, v25, v26
1999        smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
2000        shift_store_8   \type, \d_strd, v1, v2, v3, v4
2001        b.le            9f
2002        mov             v16.16b, v20.16b
2003        mov             v17.16b, v21.16b
2004        mov             v18.16b, v22.16b
2005        mov             v19.16b, v23.16b
2006        mov             v20.16b, v24.16b
2007        mov             v21.16b, v25.16b
2008        mov             v22.16b, v26.16b
2009        b               88b
20109:
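        // Advance to the next 8-pixel column strip: halve the strides that
        // were doubled for two-row processing, rewind src/dst back past the
        // processed rows (plus the rows loaded ahead of the loop), restore
        // the height and step 16 bytes (8 pixels) to the right.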
2011        subs            \w,  \w,  #8
2012        b.le            0f
2013        asr             \s_strd, \s_strd, #1
2014        asr             \d_strd, \d_strd, #1
2015        msub            \src, \s_strd, \xmy, \src
2016        msub            \dst, \d_strd, \xmy, \dst
2017        sub             \src, \src, \s_strd, lsl #3
2018        mov             \h,  \my
2019        add             \src, \src, #16
2020        add             \dst, \dst, #16
2021        b               168b
20220:
2023        ret
2024
2025160:
2026        AARCH64_VALID_JUMP_TARGET
2027        b.gt            1680b
2028
2029        // 16x2, 16x4 v
2030        add             \xmy, \xmy, #2
2031        ld1             {v0.s}[0], [\xmy]
2032        sub             \src, \src, \s_strd
2033        sxtl            v0.8h,   v0.8b
2034
2035        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
203616:
2037        load_16h        \src, \src, \s_strd, v22, v23
2038        subs            \h,  \h,  #1
2039        smull_smlal_4tap   v1, v16, v18, v20, v22
2040        smull2_smlal2_4tap v2, v16, v18, v20, v22
2041        smull_smlal_4tap   v3, v17, v19, v21, v23
2042        smull2_smlal2_4tap v4, v17, v19, v21, v23
2043        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
2044        b.le            0f
2045        mov             v16.16b, v18.16b
2046        mov             v17.16b, v19.16b
2047        mov             v18.16b, v20.16b
2048        mov             v19.16b, v21.16b
2049        mov             v20.16b, v22.16b
2050        mov             v21.16b, v23.16b
2051        b               16b
20520:
2053        ret
2054
2055L(\type\()_\taps\()_v_tbl):
2056        .hword L(\type\()_\taps\()_v_tbl) - 1280b
2057        .hword L(\type\()_\taps\()_v_tbl) -  640b
2058        .hword L(\type\()_\taps\()_v_tbl) -  320b
2059        .hword L(\type\()_\taps\()_v_tbl) -  160b
2060        .hword L(\type\()_\taps\()_v_tbl) -   80b
2061        .hword L(\type\()_\taps\()_v_tbl) -   40b
2062        .hword L(\type\()_\taps\()_v_tbl) -   20b
2063        .hword 0
2064
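        // Horizontal+vertical (hv) path: each block first runs the
        // horizontal filter into 16-bit intermediates and then filters
        // those vertically. The width-specific blocks call the
        // L(..._filter_2/4/8) helpers with bl, so the original return
        // address is saved in x15 and the blocks return with "ret x15".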
2065L(\type\()_\taps\()_hv):
2066        cmp             \h,  #4
2067        ubfx            w10, \my, #7, #7
2068        and             \my, \my, #0x7f
2069        b.le            4f
2070        mov             \my,  w10
20714:
2072        add             \xmy, x11, \my, uxtw #3
2073
2074        adr             x10, L(\type\()_\taps\()_hv_tbl)
2075        dup             v30.4s,  w12           // 6 - intermediate_bits
2076        ldrh            w9,  [x10, x9, lsl #1]
2077        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
2078.ifc \type, put
2079        dup             v29.4s,  w13           // 6 + intermediate_bits
2080.else
2081        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2082.endif
2083        sub             x10, x10, w9, uxtw
2084.ifc \type, put
2085        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
2086.endif
2087        br              x10
2088
208920:
2090        AARCH64_VALID_JUMP_TARGET
2091.ifc \type, put
2092        add             \xmx,  \xmx,  #2
2093        ld1             {v0.s}[0],  [\xmx]
2094        b.gt            280f
2095        add             \xmy,  \xmy,  #2
2096        ld1             {v1.s}[0],  [\xmy]
2097
2098        // 2x2, 2x4 hv
2099        sub             \sr2, \src, #2
2100        sub             \src, \sr2, \s_strd
2101        add             \ds2, \dst, \d_strd
2102        lsl             \s_strd, \s_strd, #1
2103        lsl             \d_strd, \d_strd, #1
2104        sxtl            v0.8h,   v0.8b
2105        sxtl            v1.8h,   v1.8b
2106        mov             x15, x30
2107
2108        ld1             {v27.8h}, [\src], \s_strd
2109        ext             v28.16b, v27.16b, v27.16b, #2
2110        smull           v27.4s,  v27.4h,  v0.4h
2111        smull           v28.4s,  v28.4h,  v0.4h
2112        addp            v27.4s,  v27.4s,  v28.4s
2113        addp            v16.4s,  v27.4s,  v27.4s
2114        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2115        bl              L(\type\()_\taps\()_filter_2)
2116        // The intermediates from the horizontal pass fit in 16 bit without
2117        // any bias; we could just as well keep them as .4s, but narrowing
2118        // them to .4h gives a significant speedup on out of order cores
2119        // (at the cost of a smaller slowdown on in-order cores such as A53).
2120        xtn             v16.4h,  v16.4s
2121
2122        trn1            v16.2s,  v16.2s,  v24.2s
2123        mov             v17.8b,  v24.8b
2124
21252:
2126        bl              L(\type\()_\taps\()_filter_2)
2127
2128        ext             v18.8b,  v17.8b,  v24.8b,  #4
2129        smull           v2.4s,   v16.4h,  v1.h[0]
2130        smlal           v2.4s,   v17.4h,  v1.h[1]
2131        smlal           v2.4s,   v18.4h,  v1.h[2]
2132        smlal           v2.4s,   v24.4h,  v1.h[3]
2133
2134        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2135        sqxtun          v2.4h,   v2.4s
2136        umin            v2.4h,   v2.4h,   v31.4h
2137        subs            \h,  \h,  #2
2138        st1             {v2.s}[0], [\dst], \d_strd
2139        st1             {v2.s}[1], [\ds2], \d_strd
2140        b.le            0f
2141        mov             v16.8b,  v18.8b
2142        mov             v17.8b,  v24.8b
2143        b               2b
2144
2145280:    // 2x8, 2x16, 2x32 hv
2146        ld1             {v1.8b},  [\xmy]
2147        sub             \src, \src, #2
2148        sub             \sr2, \src, \s_strd, lsl #1
2149        sub             \src, \sr2, \s_strd
2150        add             \ds2, \dst, \d_strd
2151        lsl             \s_strd, \s_strd, #1
2152        lsl             \d_strd, \d_strd, #1
2153        sxtl            v0.8h,   v0.8b
2154        sxtl            v1.8h,   v1.8b
2155        mov             x15, x30
2156
2157        ld1             {v27.8h}, [\src], \s_strd
2158        ext             v28.16b, v27.16b, v27.16b, #2
2159        smull           v27.4s,  v27.4h,  v0.4h
2160        smull           v28.4s,  v28.4h,  v0.4h
2161        addp            v27.4s,  v27.4s,  v28.4s
2162        addp            v16.4s,  v27.4s,  v27.4s
2163        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
2164        // The intermediates from the horizontal pass fit in 16 bit without
2165        // any bias; we could just as well keep them as .4s, but narrowing
2166        // them to .4h gives a significant speedup on out of order cores
2167        // (at the cost of a smaller slowdown on in-order cores such as A53).
2168
2169        bl              L(\type\()_\taps\()_filter_2)
2170        xtn             v16.4h,  v16.4s
2171        trn1            v16.2s,  v16.2s,  v24.2s
2172        mov             v17.8b,  v24.8b
2173        bl              L(\type\()_\taps\()_filter_2)
2174        ext             v18.8b,  v17.8b,  v24.8b,  #4
2175        mov             v19.8b,  v24.8b
2176        bl              L(\type\()_\taps\()_filter_2)
2177        ext             v20.8b,  v19.8b,  v24.8b,  #4
2178        mov             v21.8b,  v24.8b
2179
218028:
2181        bl              L(\type\()_\taps\()_filter_2)
2182        ext             v22.8b,  v21.8b,  v24.8b,  #4
2183.ifc \taps, 6tap
2184        smull           v3.4s,   v17.4h,  v1.h[1]
2185        smlal           v3.4s,   v18.4h,  v1.h[2]
2186        smlal           v3.4s,   v19.4h,  v1.h[3]
2187        smlal           v3.4s,   v20.4h,  v1.h[4]
2188        smlal           v3.4s,   v21.4h,  v1.h[5]
2189        smlal           v3.4s,   v22.4h,  v1.h[6]
2190.else   // 8tap
2191        smull           v3.4s,   v16.4h,  v1.h[0]
2192        smlal           v3.4s,   v17.4h,  v1.h[1]
2193        smlal           v3.4s,   v18.4h,  v1.h[2]
2194        smlal           v3.4s,   v19.4h,  v1.h[3]
2195        smlal           v3.4s,   v20.4h,  v1.h[4]
2196        smlal           v3.4s,   v21.4h,  v1.h[5]
2197        smlal           v3.4s,   v22.4h,  v1.h[6]
2198        smlal           v3.4s,   v24.4h,  v1.h[7]
2199.endif
2200
2201        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2202        sqxtun          v3.4h,   v3.4s
2203        umin            v3.4h,   v3.4h,   v31.4h
2204        subs            \h,  \h,  #2
2205        st1             {v3.s}[0], [\dst], \d_strd
2206        st1             {v3.s}[1], [\ds2], \d_strd
2207        b.le            0f
2208        mov             v16.8b,  v18.8b
2209        mov             v17.8b,  v19.8b
2210        mov             v18.8b,  v20.8b
2211        mov             v19.8b,  v21.8b
2212        mov             v20.8b,  v22.8b
2213        mov             v21.8b,  v24.8b
2214        b               28b
2215
22160:
2217        ret             x15
2218
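        // Horizontal 4-tap filter helper for the 2-wide hv path: filters
        // one new row from each of \sr2 and \src with the coefficients in
        // v0 and returns the four results (two pixels per row), already
        // shifted down to intermediate precision, in v24.4h.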
2219L(\type\()_\taps\()_filter_2):
2220        ld1             {v25.8h},  [\sr2], \s_strd
2221        ld1             {v27.8h},  [\src], \s_strd
2222        ext             v26.16b, v25.16b, v25.16b, #2
2223        ext             v28.16b, v27.16b, v27.16b, #2
2224        trn1            v24.2s,  v25.2s,  v27.2s
2225        trn2            v27.2s,  v25.2s,  v27.2s
2226        trn1            v25.2s,  v26.2s,  v28.2s
2227        trn2            v28.2s,  v26.2s,  v28.2s
2228        smull           v24.4s,  v24.4h,  v0.h[0]
2229        smlal           v24.4s,  v25.4h,  v0.h[1]
2230        smlal           v24.4s,  v27.4h,  v0.h[2]
2231        smlal           v24.4s,  v28.4h,  v0.h[3]
2232        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2233        xtn             v24.4h,  v24.4s
2234        ret
2235.endif
2236
223740:
2238        AARCH64_VALID_JUMP_TARGET
2239        add             \xmx, \xmx, #2
2240        ld1             {v0.s}[0],  [\xmx]
2241        b.gt            480f
2242        add             \xmy, \xmy,  #2
2243        ld1             {v1.s}[0],  [\xmy]
2244        sub             \sr2, \src, #2
2245        sub             \src, \sr2, \s_strd
2246        add             \ds2, \dst, \d_strd
2247        lsl             \s_strd, \s_strd, #1
2248        lsl             \d_strd, \d_strd, #1
2249        sxtl            v0.8h,   v0.8b
2250        sxtl            v1.8h,   v1.8b
2251        mov             x15, x30
2252
2253        // 4x2, 4x4 hv
2254        ld1             {v25.8h}, [\src], \s_strd
2255        ext             v26.16b, v25.16b, v25.16b, #2
2256        ext             v27.16b, v25.16b, v25.16b, #4
2257        ext             v28.16b, v25.16b, v25.16b, #6
2258        smull           v25.4s,  v25.4h,  v0.h[0]
2259        smlal           v25.4s,  v26.4h,  v0.h[1]
2260        smlal           v25.4s,  v27.4h,  v0.h[2]
2261        smlal           v25.4s,  v28.4h,  v0.h[3]
2262        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2263        // The intermediates from the horizontal pass fit in 16 bit without
2264        // any bias; we could just as well keep them as .4s, but narrowing
2265        // them to .4h gives a significant speedup on out of order cores
2266        // (at the cost of a smaller slowdown on in-order cores such as A53).
2267        xtn             v16.4h,  v16.4s
2268
2269        bl              L(\type\()_\taps\()_filter_4)
2270        mov             v17.8b,  v24.8b
2271        mov             v18.8b,  v25.8b
2272
22734:
2274        bl              L(\type\()_\taps\()_filter_4)
2275        smull           v2.4s,   v16.4h,  v1.h[0]
2276        smlal           v2.4s,   v17.4h,  v1.h[1]
2277        smlal           v2.4s,   v18.4h,  v1.h[2]
2278        smlal           v2.4s,   v24.4h,  v1.h[3]
2279        smull           v3.4s,   v17.4h,  v1.h[0]
2280        smlal           v3.4s,   v18.4h,  v1.h[1]
2281        smlal           v3.4s,   v24.4h,  v1.h[2]
2282        smlal           v3.4s,   v25.4h,  v1.h[3]
2283.ifc \type, put
2284        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2285        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2286        sqxtun          v2.4h,   v2.4s
2287        sqxtun2         v2.8h,   v3.4s
2288        umin            v2.8h,   v2.8h,   v31.8h
2289.else
2290        rshrn           v2.4h,   v2.4s,   #6
2291        rshrn2          v2.8h,   v3.4s,   #6
2292        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2293.endif
2294        subs            \h,  \h,  #2
2295
2296        st1             {v2.d}[0], [\dst], \d_strd
2297        st1             {v2.d}[1], [\ds2], \d_strd
2298        b.le            0f
2299        mov             v16.8b,  v18.8b
2300        mov             v17.8b,  v24.8b
2301        mov             v18.8b,  v25.8b
2302        b               4b
2303
2304480:    // 4x8, 4x16, 4x32 hv
2305        ld1             {v1.8b},  [\xmy]
2306        sub             \src, \src, #2
2307.ifc \taps, 6tap
2308        sub             \sr2, \src, \s_strd
2309        sub             \src, \src, \s_strd, lsl #1
2310.else
2311        sub             \sr2, \src, \s_strd, lsl #1
2312        sub             \src, \sr2, \s_strd
2313.endif
2314        add             \ds2, \dst, \d_strd
2315        lsl             \s_strd, \s_strd, #1
2316        lsl             \d_strd, \d_strd, #1
2317        sxtl            v0.8h,   v0.8b
2318        sxtl            v1.8h,   v1.8b
2319        mov             x15, x30
2320
2321        ld1             {v25.8h}, [\src], \s_strd
2322        ext             v26.16b, v25.16b, v25.16b, #2
2323        ext             v27.16b, v25.16b, v25.16b, #4
2324        ext             v28.16b, v25.16b, v25.16b, #6
2325        smull           v25.4s,  v25.4h,  v0.h[0]
2326        smlal           v25.4s,  v26.4h,  v0.h[1]
2327        smlal           v25.4s,  v27.4h,  v0.h[2]
2328        smlal           v25.4s,  v28.4h,  v0.h[3]
2329        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2330        // The intermediates from the horizontal pass fit in 16 bit without
2331        // any bias; we could just as well keep them as .4s, but narrowing
2332        // them to .4h gives a significant speedup on out of order cores
2333        // (at the cost of a smaller slowdown on in-order cores such as A53).
2334.ifc \taps, 6tap
2335        xtn             v18.4h,  v16.4s
2336.else
2337        xtn             v16.4h,  v16.4s
2338
2339        bl              L(\type\()_\taps\()_filter_4)
2340        mov             v17.8b,  v24.8b
2341        mov             v18.8b,  v25.8b
2342.endif
2343        bl              L(\type\()_\taps\()_filter_4)
2344        mov             v19.8b,  v24.8b
2345        mov             v20.8b,  v25.8b
2346        bl              L(\type\()_\taps\()_filter_4)
2347        mov             v21.8b,  v24.8b
2348        mov             v22.8b,  v25.8b
2349
235048:
2351        bl              L(\type\()_\taps\()_filter_4)
2352.ifc \taps, 6tap
2353        smull           v3.4s,   v18.4h,  v1.h[1]
2354        smlal           v3.4s,   v19.4h,  v1.h[2]
2355        smlal           v3.4s,   v20.4h,  v1.h[3]
2356        smlal           v3.4s,   v21.4h,  v1.h[4]
2357        smlal           v3.4s,   v22.4h,  v1.h[5]
2358        smlal           v3.4s,   v24.4h,  v1.h[6]
2359        smull           v4.4s,   v19.4h,  v1.h[1]
2360        smlal           v4.4s,   v20.4h,  v1.h[2]
2361        smlal           v4.4s,   v21.4h,  v1.h[3]
2362        smlal           v4.4s,   v22.4h,  v1.h[4]
2363        smlal           v4.4s,   v24.4h,  v1.h[5]
2364        smlal           v4.4s,   v25.4h,  v1.h[6]
2365.else   // 8tap
2366        smull           v3.4s,   v16.4h,  v1.h[0]
2367        smlal           v3.4s,   v17.4h,  v1.h[1]
2368        smlal           v3.4s,   v18.4h,  v1.h[2]
2369        smlal           v3.4s,   v19.4h,  v1.h[3]
2370        smlal           v3.4s,   v20.4h,  v1.h[4]
2371        smlal           v3.4s,   v21.4h,  v1.h[5]
2372        smlal           v3.4s,   v22.4h,  v1.h[6]
2373        smlal           v3.4s,   v24.4h,  v1.h[7]
2374        smull           v4.4s,   v17.4h,  v1.h[0]
2375        smlal           v4.4s,   v18.4h,  v1.h[1]
2376        smlal           v4.4s,   v19.4h,  v1.h[2]
2377        smlal           v4.4s,   v20.4h,  v1.h[3]
2378        smlal           v4.4s,   v21.4h,  v1.h[4]
2379        smlal           v4.4s,   v22.4h,  v1.h[5]
2380        smlal           v4.4s,   v24.4h,  v1.h[6]
2381        smlal           v4.4s,   v25.4h,  v1.h[7]
2382.endif
2383.ifc \type, put
2384        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2385        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2386        sqxtun          v3.4h,   v3.4s
2387        sqxtun2         v3.8h,   v4.4s
2388        umin            v3.8h,   v3.8h,   v31.8h
2389.else
2390        rshrn           v3.4h,   v3.4s,   #6
2391        rshrn2          v3.8h,   v4.4s,   #6
2392        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2393.endif
2394        subs            \h,  \h,  #2
2395        st1             {v3.d}[0], [\dst], \d_strd
2396        st1             {v3.d}[1], [\ds2], \d_strd
2397        b.le            0f
2398.ifc \taps, 8tap
2399        mov             v16.8b,  v18.8b
2400        mov             v17.8b,  v19.8b
2401.endif
2402        mov             v18.8b,  v20.8b
2403        mov             v19.8b,  v21.8b
2404        mov             v20.8b,  v22.8b
2405        mov             v21.8b,  v24.8b
2406        mov             v22.8b,  v25.8b
2407        b               48b
24080:
2409        ret             x15
2410
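        // The 4-wide equivalent: filters one new row from each of \sr2 and
        // \src with the 4-tap coefficients in v0 and returns them, shifted
        // to intermediate precision, in v24.4h and v25.4h.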
2411L(\type\()_\taps\()_filter_4):
2412        ld1             {v24.8h}, [\sr2], \s_strd
2413        ld1             {v25.8h}, [\src], \s_strd
2414        ext             v26.16b, v24.16b, v24.16b, #2
2415        ext             v27.16b, v24.16b, v24.16b, #4
2416        ext             v28.16b, v24.16b, v24.16b, #6
2417        smull           v24.4s,  v24.4h,  v0.h[0]
2418        smlal           v24.4s,  v26.4h,  v0.h[1]
2419        smlal           v24.4s,  v27.4h,  v0.h[2]
2420        smlal           v24.4s,  v28.4h,  v0.h[3]
2421        ext             v26.16b, v25.16b, v25.16b, #2
2422        ext             v27.16b, v25.16b, v25.16b, #4
2423        ext             v28.16b, v25.16b, v25.16b, #6
2424        smull           v25.4s,  v25.4h,  v0.h[0]
2425        smlal           v25.4s,  v26.4h,  v0.h[1]
2426        smlal           v25.4s,  v27.4h,  v0.h[2]
2427        smlal           v25.4s,  v28.4h,  v0.h[3]
2428        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2429        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2430        xtn             v24.4h,  v24.4s
2431        xtn             v25.4h,  v25.4s
2432        ret
2433
243480:
2435160:
2436320:
2437        AARCH64_VALID_JUMP_TARGET
2438        b.gt            880f
2439        add             \xmy,  \xmy,  #2
2440        ld1             {v0.8b},  [\xmx]
2441        ld1             {v1.s}[0],  [\xmy]
2442        sub             \src,  \src,  #6
2443        sub             \src,  \src,  \s_strd
2444        sxtl            v0.8h,   v0.8b
2445        sxtl            v1.8h,   v1.8b
2446        mov             x15, x30
2447        mov             \my, \h
2448
2449164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
2450        add             \ds2,  \dst,  \d_strd
2451        add             \sr2,  \src,  \s_strd
2452        lsl             \d_strd, \d_strd, #1
2453        lsl             \s_strd, \s_strd, #1
2454
2455        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2456        smull           v24.4s,  v27.4h,  v0.h[0]
2457        smull2          v25.4s,  v27.8h,  v0.h[0]
2458.irpc i, 1234567
2459        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2460        smlal           v24.4s,  v26.4h,  v0.h[\i]
2461        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2462.endr
2463        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2464        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2465        // The intermediates from the horizontal pass fit in 16 bit without
2466        // any bias; we could just as well keep them as .4s, but narrowing
2467        // them to .4h gives a significant speedup on out of order cores
2468        // (at the cost of a smaller slowdown on in-order cores such as A53),
2469        // and conserves register space (no need to clobber v8-v15).
2470        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2471
2472        bl              L(\type\()_\taps\()_filter_8)
2473        mov             v17.16b, v23.16b
2474        mov             v18.16b, v24.16b
2475
24768:
2477        smull           v2.4s,   v16.4h,  v1.h[0]
2478        smull2          v3.4s,   v16.8h,  v1.h[0]
2479        bl              L(\type\()_\taps\()_filter_8)
2480        smull           v4.4s,   v17.4h,  v1.h[0]
2481        smull2          v5.4s,   v17.8h,  v1.h[0]
2482        smlal           v2.4s,   v17.4h,  v1.h[1]
2483        smlal2          v3.4s,   v17.8h,  v1.h[1]
2484        smlal           v4.4s,   v18.4h,  v1.h[1]
2485        smlal2          v5.4s,   v18.8h,  v1.h[1]
2486        smlal           v2.4s,   v18.4h,  v1.h[2]
2487        smlal2          v3.4s,   v18.8h,  v1.h[2]
2488        smlal           v4.4s,   v23.4h,  v1.h[2]
2489        smlal2          v5.4s,   v23.8h,  v1.h[2]
2490        smlal           v2.4s,   v23.4h,  v1.h[3]
2491        smlal2          v3.4s,   v23.8h,  v1.h[3]
2492        smlal           v4.4s,   v24.4h,  v1.h[3]
2493        smlal2          v5.4s,   v24.8h,  v1.h[3]
2494.ifc \type, put
2495        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2496        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2497        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2498        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2499        sqxtun          v2.4h,   v2.4s
2500        sqxtun2         v2.8h,   v3.4s
2501        sqxtun          v3.4h,   v4.4s
2502        sqxtun2         v3.8h,   v5.4s
2503        umin            v2.8h,   v2.8h,   v31.8h
2504        umin            v3.8h,   v3.8h,   v31.8h
2505.else
2506        rshrn           v2.4h,   v2.4s,   #6
2507        rshrn2          v2.8h,   v3.4s,   #6
2508        rshrn           v3.4h,   v4.4s,   #6
2509        rshrn2          v3.8h,   v5.4s,   #6
2510        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2511        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2512.endif
2513        subs            \h,  \h,  #2
2514        st1             {v2.8h}, [\dst], \d_strd
2515        st1             {v3.8h}, [\ds2], \d_strd
2516        b.le            9f
2517        mov             v16.16b, v18.16b
2518        mov             v17.16b, v23.16b
2519        mov             v18.16b, v24.16b
2520        b               8b
25219:
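        // As in the vertical path above: restore the strides and height,
        // rewind the pointers and move 16 bytes (8 pixels) to the right for
        // the next column strip.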
2522        subs            \w,  \w,  #8
2523        b.le            0f
2524        asr             \s_strd,  \s_strd,  #1
2525        asr             \d_strd,  \d_strd,  #1
2526        msub            \src,  \s_strd,  \xmy,  \src
2527        msub            \dst,  \d_strd,  \xmy,  \dst
2528        sub             \src,  \src,  \s_strd,  lsl #2
2529        mov             \h,  \my
2530        add             \src,  \src,  #16
2531        add             \dst,  \dst,  #16
2532        b               164b
2533
2534880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
2535640:
25361280:
2537        AARCH64_VALID_JUMP_TARGET
2538        ld1             {v0.8b},  [\xmx]
2539        ld1             {v1.8b},  [\xmy]
2540        sub             \src,  \src,  #6
2541.ifc \taps, 8tap
2542        sub             \src,  \src,  \s_strd
2543.endif
2544        sub             \src,  \src,  \s_strd, lsl #1
2545        sxtl            v0.8h,   v0.8b
2546        sxtl            v1.8h,   v1.8b
2547        mov             x15, x30
2548        mov             \my, \h
2549
2550168:
2551        add             \ds2,  \dst,  \d_strd
2552        add             \sr2,  \src,  \s_strd
2553        lsl             \d_strd, \d_strd, #1
2554        lsl             \s_strd, \s_strd, #1
2555
2556        ld1             {v27.8h, v28.8h},  [\src], \s_strd
2557.ifc \taps, 6tap
2558        ext             v26.16b, v27.16b, v28.16b, #2
2559        smull           v24.4s,  v26.4h,  v0.h[1]
2560        smull2          v25.4s,  v26.8h,  v0.h[1]
2561.irpc i, 23456
2562        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2563        smlal           v24.4s,  v26.4h,  v0.h[\i]
2564        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2565.endr
2566.else   // 8tap
2567        smull           v24.4s,  v27.4h,  v0.h[0]
2568        smull2          v25.4s,  v27.8h,  v0.h[0]
2569.irpc i, 1234567
2570        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
2571        smlal           v24.4s,  v26.4h,  v0.h[\i]
2572        smlal2          v25.4s,  v26.8h,  v0.h[\i]
2573.endr
2574.endif
2575        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
2576        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2577        // The intermediates from the horizontal pass fit in 16 bits without
2578        // any bias; we could just as well keep them as .4s, but narrowing
2579        // them to .4h gives a significant speedup on out-of-order cores
2580        // (at the cost of a smaller slowdown on in-order cores such as A53),
2581        // and conserves register space (no need to clobber v8-v15).
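        // As a rough scalar picture (illustrative names), each narrowed lane is
        //   mid[x] = (sum_i filter_x[i]*src[x+i] + rnd) >> (6 - intermediate_bits)
        // and since both .4s inputs are within 16-bit range, taking the even
        // halfword lanes with uzp1 performs the same truncation xtn/xtn2
        // would, in a single instruction.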
2582.ifc \taps, 6tap
2583        uzp1            v18.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2584.else
2585        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
2586
2587        bl              L(\type\()_\taps\()_filter_8)
2588        mov             v17.16b, v23.16b
2589        mov             v18.16b, v24.16b
2590.endif
2591        bl              L(\type\()_\taps\()_filter_8)
2592        mov             v19.16b, v23.16b
2593        mov             v20.16b, v24.16b
2594        bl              L(\type\()_\taps\()_filter_8)
2595        mov             v21.16b, v23.16b
2596        mov             v22.16b, v24.16b
2597
259888:
2599.ifc \taps, 6tap
2600        smull           v2.4s,   v18.4h,  v1.h[1]
2601        smull2          v3.4s,   v18.8h,  v1.h[1]
2602        bl              L(\type\()_\taps\()_filter_8)
2603        smull           v4.4s,   v19.4h,  v1.h[1]
2604        smull2          v5.4s,   v19.8h,  v1.h[1]
2605        smlal           v2.4s,   v19.4h,  v1.h[2]
2606        smlal2          v3.4s,   v19.8h,  v1.h[2]
2607        smlal           v4.4s,   v20.4h,  v1.h[2]
2608        smlal2          v5.4s,   v20.8h,  v1.h[2]
2609        smlal           v2.4s,   v20.4h,  v1.h[3]
2610        smlal2          v3.4s,   v20.8h,  v1.h[3]
2611        smlal           v4.4s,   v21.4h,  v1.h[3]
2612        smlal2          v5.4s,   v21.8h,  v1.h[3]
2613        smlal           v2.4s,   v21.4h,  v1.h[4]
2614        smlal2          v3.4s,   v21.8h,  v1.h[4]
2615        smlal           v4.4s,   v22.4h,  v1.h[4]
2616        smlal2          v5.4s,   v22.8h,  v1.h[4]
2617        smlal           v2.4s,   v22.4h,  v1.h[5]
2618        smlal2          v3.4s,   v22.8h,  v1.h[5]
2619        smlal           v4.4s,   v23.4h,  v1.h[5]
2620        smlal2          v5.4s,   v23.8h,  v1.h[5]
2621        smlal           v2.4s,   v23.4h,  v1.h[6]
2622        smlal2          v3.4s,   v23.8h,  v1.h[6]
2623        smlal           v4.4s,   v24.4h,  v1.h[6]
2624        smlal2          v5.4s,   v24.8h,  v1.h[6]
2625.else   // 8tap
2626        smull           v2.4s,   v16.4h,  v1.h[0]
2627        smull2          v3.4s,   v16.8h,  v1.h[0]
2628        bl              L(\type\()_\taps\()_filter_8)
2629        smull           v4.4s,   v17.4h,  v1.h[0]
2630        smull2          v5.4s,   v17.8h,  v1.h[0]
2631        smlal           v2.4s,   v17.4h,  v1.h[1]
2632        smlal2          v3.4s,   v17.8h,  v1.h[1]
2633        smlal           v4.4s,   v18.4h,  v1.h[1]
2634        smlal2          v5.4s,   v18.8h,  v1.h[1]
2635        smlal           v2.4s,   v18.4h,  v1.h[2]
2636        smlal2          v3.4s,   v18.8h,  v1.h[2]
2637        smlal           v4.4s,   v19.4h,  v1.h[2]
2638        smlal2          v5.4s,   v19.8h,  v1.h[2]
2639        smlal           v2.4s,   v19.4h,  v1.h[3]
2640        smlal2          v3.4s,   v19.8h,  v1.h[3]
2641        smlal           v4.4s,   v20.4h,  v1.h[3]
2642        smlal2          v5.4s,   v20.8h,  v1.h[3]
2643        smlal           v2.4s,   v20.4h,  v1.h[4]
2644        smlal2          v3.4s,   v20.8h,  v1.h[4]
2645        smlal           v4.4s,   v21.4h,  v1.h[4]
2646        smlal2          v5.4s,   v21.8h,  v1.h[4]
2647        smlal           v2.4s,   v21.4h,  v1.h[5]
2648        smlal2          v3.4s,   v21.8h,  v1.h[5]
2649        smlal           v4.4s,   v22.4h,  v1.h[5]
2650        smlal2          v5.4s,   v22.8h,  v1.h[5]
2651        smlal           v2.4s,   v22.4h,  v1.h[6]
2652        smlal2          v3.4s,   v22.8h,  v1.h[6]
2653        smlal           v4.4s,   v23.4h,  v1.h[6]
2654        smlal2          v5.4s,   v23.8h,  v1.h[6]
2655        smlal           v2.4s,   v23.4h,  v1.h[7]
2656        smlal2          v3.4s,   v23.8h,  v1.h[7]
2657        smlal           v4.4s,   v24.4h,  v1.h[7]
2658        smlal2          v5.4s,   v24.8h,  v1.h[7]
2659.endif
2660.ifc \type, put
2661        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
2662        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
2663        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
2664        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
2665        sqxtun          v2.4h,   v2.4s
2666        sqxtun2         v2.8h,   v3.4s
2667        sqxtun          v3.4h,   v4.4s
2668        sqxtun2         v3.8h,   v5.4s
2669        umin            v2.8h,   v2.8h,   v31.8h
2670        umin            v3.8h,   v3.8h,   v31.8h
2671.else
2672        rshrn           v2.4h,   v2.4s,   #6
2673        rshrn2          v2.8h,   v3.4s,   #6
2674        rshrn           v3.4h,   v4.4s,   #6
2675        rshrn2          v3.8h,   v5.4s,   #6
2676        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
2677        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
2678.endif
2679        subs            \h,  \h,  #2
2680        st1             {v2.8h}, [\dst], \d_strd
2681        st1             {v3.8h}, [\ds2], \d_strd
2682        b.le            9f
2683.ifc \taps, 8tap
2684        mov             v16.16b, v18.16b
2685        mov             v17.16b, v19.16b
2686.endif
2687        mov             v18.16b, v20.16b
2688        mov             v19.16b, v21.16b
2689        mov             v20.16b, v22.16b
2690        mov             v21.16b, v23.16b
2691        mov             v22.16b, v24.16b
2692        b               88b
26939:
2694        subs            \w,  \w,  #8
2695        b.le            0f
2696        asr             \s_strd,  \s_strd,  #1
2697        asr             \d_strd,  \d_strd,  #1
2698        msub            \src,  \s_strd,  \xmy,  \src
2699        msub            \dst,  \d_strd,  \xmy,  \dst
2700        sub             \src,  \src,  \s_strd,  lsl #3
2701        mov             \h,  \my
2702        add             \src,  \src,  #16
2703        add             \dst,  \dst,  #16
2704.ifc \taps, 6tap
2705        add             \src,  \src,  \s_strd,  lsl #1
2706.endif
2707        b               168b
27080:
2709        ret             x15
2710
2711L(\type\()_\taps\()_filter_8):
2712        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
2713        ld1             {v6.8h, v7.8h},  [\src], \s_strd
2714.ifc \taps, 6tap
2715        ext             v23.16b, v4.16b,  v5.16b,  #2
2716        ext             v24.16b, v6.16b,  v7.16b,  #2
2717        smull           v25.4s,  v23.4h,  v0.h[1]
2718        smull2          v26.4s,  v23.8h,  v0.h[1]
2719        smull           v27.4s,  v24.4h,  v0.h[1]
2720        smull2          v28.4s,  v24.8h,  v0.h[1]
2721.irpc i, 23456
2722        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
2723        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
2724        smlal           v25.4s,  v23.4h,  v0.h[\i]
2725        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2726        smlal           v27.4s,  v24.4h,  v0.h[\i]
2727        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2728.endr
2729.else   // 8tap
2730        smull           v25.4s,  v4.4h,   v0.h[0]
2731        smull2          v26.4s,  v4.8h,   v0.h[0]
2732        smull           v27.4s,  v6.4h,   v0.h[0]
2733        smull2          v28.4s,  v6.8h,   v0.h[0]
2734.irpc i, 1234567
2735        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
2736        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
2737        smlal           v25.4s,  v23.4h,  v0.h[\i]
2738        smlal2          v26.4s,  v23.8h,  v0.h[\i]
2739        smlal           v27.4s,  v24.4h,  v0.h[\i]
2740        smlal2          v28.4s,  v24.8h,  v0.h[\i]
2741.endr
2742.endif
2743        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
2744        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
2745        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
2746        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
2747        uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
2748        uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
2749        ret
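        // (the helper returns two more horizontally filtered, narrowed rows
        //  in v23/v24, advancing \sr2 and \src by one row each per call)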
2750
2751L(\type\()_\taps\()_hv_tbl):
2752        .hword L(\type\()_\taps\()_hv_tbl) - 1280b
2753        .hword L(\type\()_\taps\()_hv_tbl) -  640b
2754        .hword L(\type\()_\taps\()_hv_tbl) -  320b
2755        .hword L(\type\()_\taps\()_hv_tbl) -  160b
2756        .hword L(\type\()_\taps\()_hv_tbl) -   80b
2757        .hword L(\type\()_\taps\()_hv_tbl) -   40b
2758        .hword L(\type\()_\taps\()_hv_tbl) -   20b
2759        .hword 0
2760endfunc
2761.endm
2762
2763
2764.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
2765function \type\()_bilin_16bpc_neon, export=1
2766.ifc \bdmax, w8
2767        ldr             w8,  [sp]
2768.endif
2769        dup             v1.8h,   \mx
2770        dup             v3.8h,   \my
2771        mov             w10, #16
2772        sub             w9,  w10, \mx
2773        sub             w10, w10, \my
2774        dup             v0.8h,   w9
2775        dup             v2.8h,   w10
2776.ifc \type, prep
2777        uxtw            \d_strd, \w
2778        lsl             \d_strd, \d_strd, #1
2779.endif
2780
2781        clz             \bdmax,   \bdmax       // bitdepth_max
2782        clz             w9,  \w
2783        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
2784        mov             w11, #4
2785        sub             w9,  w9,  #24
2786        sub             w11, w11, \bdmax  // 4 - intermediate_bits
2787        add             w12, \bdmax, #4   // 4 + intermediate_bits
2788        cbnz            \mx, L(\type\()_bilin_h)
2789        cbnz            \my, L(\type\()_bilin_v)
2790        b               \type\()_neon
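        // Hedged arithmetic example for the setup above: for 10 bpc,
        // bitdepth_max = 0x3ff, clz() = 22, so intermediate_bits = 4
        // (12 bpc: clz() = 20, intermediate_bits = 2).  The bilinear weights
        // are 16-mx/mx (and 16-my/my), so each pass scales by 16 = 1 << 4;
        // the "4 - intermediate_bits" and "4 + intermediate_bits" amounts in
        // w11/w12 split that scaling between the two passes.  With
        // mx == my == 0 this falls back to the unfiltered \type\()_neon copy.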
2791
2792L(\type\()_bilin_h):
2793        cbnz            \my, L(\type\()_bilin_hv)
2794
2795        adr             x10, L(\type\()_bilin_h_tbl)
2796        dup             v31.8h,  w11      // 4 - intermediate_bits
2797        ldrh            w9,  [x10, x9, lsl #1]
2798        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2799.ifc \type, put
2800        dup             v30.8h,  \bdmax   // intermediate_bits
2801.else
2802        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2803.endif
2804        sub             x10, x10, w9, uxtw
2805.ifc \type, put
2806        neg             v30.8h,  v30.8h   // -intermediate_bits
2807.endif
2808        br              x10
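        // Size dispatch: w9 = clz(w) - 24 (computed above) indexes the
        // .hword table at L(\type\()_bilin_h_tbl); the entries store
        // "table - label", so subtracting the loaded offset recovers the
        // entry point, roughly:
        //   goto tbl_addr - tbl[clz(w) - 24];   // w=128 -> 1280b ... w=2 -> 20b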
2809
281020:     // 2xN h
2811        AARCH64_VALID_JUMP_TARGET
2812.ifc \type, put
2813        add             \ds2,  \dst,  \d_strd
2814        add             \sr2,  \src,  \s_strd
2815        lsl             \d_strd,  \d_strd,  #1
2816        lsl             \s_strd,  \s_strd,  #1
28172:
2818        ld1             {v4.4h},  [\src], \s_strd
2819        ld1             {v6.4h},  [\sr2], \s_strd
2820        ext             v5.8b,   v4.8b,   v4.8b,   #2
2821        ext             v7.8b,   v6.8b,   v6.8b,   #2
2822        trn1            v4.2s,   v4.2s,   v6.2s
2823        trn1            v5.2s,   v5.2s,   v7.2s
2824        subs            \h,  \h,  #2
2825        mul             v4.4h,   v4.4h,   v0.4h
2826        mla             v4.4h,   v5.4h,   v1.4h
2827        urshl           v4.4h,   v4.4h,   v31.4h
2828        urshl           v4.4h,   v4.4h,   v30.4h
2829        st1             {v4.s}[0], [\dst], \d_strd
2830        st1             {v4.s}[1], [\ds2], \d_strd
2831        b.gt            2b
2832        ret
2833.endif
2834
283540:     // 4xN h
2836        AARCH64_VALID_JUMP_TARGET
2837        add             \ds2,  \dst,  \d_strd
2838        add             \sr2,  \src,  \s_strd
2839        lsl             \d_strd,  \d_strd,  #1
2840        lsl             \s_strd,  \s_strd,  #1
28414:
2842        ld1             {v4.8h}, [\src], \s_strd
2843        ld1             {v6.8h}, [\sr2], \s_strd
2844        ext             v5.16b,  v4.16b,  v4.16b,  #2
2845        ext             v7.16b,  v6.16b,  v6.16b,  #2
2846        trn1            v4.2d,   v4.2d,   v6.2d
2847        trn1            v5.2d,   v5.2d,   v7.2d
2848        subs            \h,  \h,  #2
2849        mul             v4.8h,   v4.8h,   v0.8h
2850        mla             v4.8h,   v5.8h,   v1.8h
2851        urshl           v4.8h,   v4.8h,   v31.8h
2852.ifc \type, put
2853        urshl           v4.8h,   v4.8h,   v30.8h
2854.else
2855        sub             v4.8h,   v4.8h,   v29.8h
2856.endif
2857        st1             {v4.d}[0], [\dst], \d_strd
2858        st1             {v4.d}[1], [\ds2], \d_strd
2859        b.gt            4b
2860        ret
2861
286280:     // 8xN h
2863        AARCH64_VALID_JUMP_TARGET
2864        add             \ds2,  \dst,  \d_strd
2865        add             \sr2,  \src,  \s_strd
2866        lsl             \d_strd,  \d_strd,  #1
2867        lsl             \s_strd,  \s_strd,  #1
28688:
2869        ldr             h5,  [\src, #16]
2870        ldr             h7,  [\sr2, #16]
2871        ld1             {v4.8h}, [\src], \s_strd
2872        ld1             {v6.8h}, [\sr2], \s_strd
2873        ext             v5.16b,  v4.16b,  v5.16b,  #2
2874        ext             v7.16b,  v6.16b,  v7.16b,  #2
2875        subs            \h,  \h,  #2
2876        mul             v4.8h,   v4.8h,   v0.8h
2877        mla             v4.8h,   v5.8h,   v1.8h
2878        mul             v6.8h,   v6.8h,   v0.8h
2879        mla             v6.8h,   v7.8h,   v1.8h
2880        urshl           v4.8h,   v4.8h,   v31.8h
2881        urshl           v6.8h,   v6.8h,   v31.8h
2882.ifc \type, put
2883        urshl           v4.8h,   v4.8h,   v30.8h
2884        urshl           v6.8h,   v6.8h,   v30.8h
2885.else
2886        sub             v4.8h,   v4.8h,   v29.8h
2887        sub             v6.8h,   v6.8h,   v29.8h
2888.endif
2889        st1             {v4.8h}, [\dst], \d_strd
2890        st1             {v6.8h}, [\ds2], \d_strd
2891        b.gt            8b
2892        ret
2893160:
2894320:
2895640:
28961280:   // 16xN, 32xN, ... h
2897        AARCH64_VALID_JUMP_TARGET
2898        add             \ds2,  \dst,  \d_strd
2899        add             \sr2,  \src,  \s_strd
2900        lsl             \s_strd,  \s_strd,  #1
2901
2902        sub             \s_strd,  \s_strd,  \w, uxtw #1
2903        sub             \s_strd,  \s_strd,  #16
2904.ifc \type, put
2905        lsl             \d_strd,  \d_strd,  #1
2906        sub             \d_strd,  \d_strd,  \w, uxtw #1
2907.endif
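        // The source stride (and, for put, the destination stride) was just
        // pre-decremented by the bytes the post-incremented loads/stores
        // consume per row, so the plain "add stride" at 9: below lands
        // directly on the next row pair.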
2908161:
2909        ld1             {v16.8h},  [\src], #16
2910        ld1             {v21.8h},  [\sr2], #16
2911        mov             \mx, \w
2912
291316:
2914        ld1             {v17.8h, v18.8h},  [\src], #32
2915        ld1             {v22.8h, v23.8h},  [\sr2], #32
2916        ext             v19.16b, v16.16b, v17.16b, #2
2917        ext             v20.16b, v17.16b, v18.16b, #2
2918        ext             v24.16b, v21.16b, v22.16b, #2
2919        ext             v25.16b, v22.16b, v23.16b, #2
2920        mul             v16.8h,  v16.8h,  v0.8h
2921        mla             v16.8h,  v19.8h,  v1.8h
2922        mul             v17.8h,  v17.8h,  v0.8h
2923        mla             v17.8h,  v20.8h,  v1.8h
2924        mul             v21.8h,  v21.8h,  v0.8h
2925        mla             v21.8h,  v24.8h,  v1.8h
2926        mul             v22.8h,  v22.8h,  v0.8h
2927        mla             v22.8h,  v25.8h,  v1.8h
2928        urshl           v16.8h,  v16.8h,  v31.8h
2929        urshl           v17.8h,  v17.8h,  v31.8h
2930        urshl           v21.8h,  v21.8h,  v31.8h
2931        urshl           v22.8h,  v22.8h,  v31.8h
2932        subs            \mx, \mx, #16
2933.ifc \type, put
2934        urshl           v16.8h,  v16.8h,  v30.8h
2935        urshl           v17.8h,  v17.8h,  v30.8h
2936        urshl           v21.8h,  v21.8h,  v30.8h
2937        urshl           v22.8h,  v22.8h,  v30.8h
2938.else
2939        sub             v16.8h,  v16.8h,  v29.8h
2940        sub             v17.8h,  v17.8h,  v29.8h
2941        sub             v21.8h,  v21.8h,  v29.8h
2942        sub             v22.8h,  v22.8h,  v29.8h
2943.endif
2944        st1             {v16.8h, v17.8h}, [\dst], #32
2945        st1             {v21.8h, v22.8h}, [\ds2], #32
2946        b.le            9f
2947
2948        mov             v16.16b, v18.16b
2949        mov             v21.16b, v23.16b
2950        b               16b
2951
29529:
2953        add             \dst,  \dst,  \d_strd
2954        add             \ds2,  \ds2,  \d_strd
2955        add             \src,  \src,  \s_strd
2956        add             \sr2,  \sr2,  \s_strd
2957
2958        subs            \h,  \h,  #2
2959        b.gt            161b
2960        ret
2961
2962L(\type\()_bilin_h_tbl):
2963        .hword L(\type\()_bilin_h_tbl) - 1280b
2964        .hword L(\type\()_bilin_h_tbl) -  640b
2965        .hword L(\type\()_bilin_h_tbl) -  320b
2966        .hword L(\type\()_bilin_h_tbl) -  160b
2967        .hword L(\type\()_bilin_h_tbl) -   80b
2968        .hword L(\type\()_bilin_h_tbl) -   40b
2969        .hword L(\type\()_bilin_h_tbl) -   20b
2970        .hword 0
2971
2972
2973L(\type\()_bilin_v):
2974        cmp             \h,  #4
2975        adr             x10, L(\type\()_bilin_v_tbl)
2976.ifc \type, prep
2977        dup             v31.8h,  w11      // 4 - intermediate_bits
2978.endif
2979        ldrh            w9,  [x10, x9, lsl #1]
2980.ifc \type, prep
2981        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
2982        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
2983.endif
2984        sub             x10, x10, w9, uxtw
2985        br              x10
2986
298720:     // 2xN v
2988        AARCH64_VALID_JUMP_TARGET
2989.ifc \type, put
2990        cmp             \h,  #2
2991        add             \ds2,  \dst,  \d_strd
2992        add             \sr2,  \src,  \s_strd
2993        lsl             \s_strd,  \s_strd,  #1
2994        lsl             \d_strd,  \d_strd,  #1
2995
2996        // 2x2 v
2997        ld1             {v16.s}[0], [\src], \s_strd
2998        b.gt            24f
299922:
3000        ld1             {v17.s}[0], [\sr2], \s_strd
3001        ld1             {v18.s}[0], [\src], \s_strd
3002        trn1            v16.2s,  v16.2s,  v17.2s
3003        trn1            v17.2s,  v17.2s,  v18.2s
3004        mul             v4.4h,   v16.4h,  v2.4h
3005        mla             v4.4h,   v17.4h,  v3.4h
3006        urshr           v4.8h,   v4.8h,   #4
3007        st1             {v4.s}[0], [\dst]
3008        st1             {v4.s}[1], [\ds2]
3009        ret
301024:     // 2x4, 2x6, 2x8, ... v
3011        ld1             {v17.s}[0], [\sr2], \s_strd
3012        ld1             {v18.s}[0], [\src], \s_strd
3013        ld1             {v19.s}[0], [\sr2], \s_strd
3014        ld1             {v20.s}[0], [\src], \s_strd
3015        sub             \h,  \h,  #4
3016        trn1            v16.2s,  v16.2s,  v17.2s
3017        trn1            v17.2s,  v17.2s,  v18.2s
3018        trn1            v18.2s,  v18.2s,  v19.2s
3019        trn1            v19.2s,  v19.2s,  v20.2s
3020        trn1            v16.2d,  v16.2d,  v18.2d
3021        trn1            v17.2d,  v17.2d,  v19.2d
3022        mul             v4.8h,   v16.8h,  v2.8h
3023        mla             v4.8h,   v17.8h,  v3.8h
3024        cmp             \h,  #2
3025        urshr           v4.8h,   v4.8h,   #4
3026        st1             {v4.s}[0], [\dst], \d_strd
3027        st1             {v4.s}[1], [\ds2], \d_strd
3028        st1             {v4.s}[2], [\dst], \d_strd
3029        st1             {v4.s}[3], [\ds2], \d_strd
3030        b.lt            0f
3031        mov             v16.8b,  v20.8b
3032        b.eq            22b
3033        b               24b
30340:
3035        ret
3036.endif
3037
303840:     // 4xN v
3039        AARCH64_VALID_JUMP_TARGET
3040        add             \ds2,  \dst,  \d_strd
3041        add             \sr2,  \src,  \s_strd
3042        lsl             \s_strd,  \s_strd,  #1
3043        lsl             \d_strd,  \d_strd,  #1
3044        ld1             {v16.4h}, [\src], \s_strd
30454:
3046        ld1             {v17.4h}, [\sr2], \s_strd
3047        ld1             {v18.4h}, [\src], \s_strd
3048        trn1            v16.2d,  v16.2d,  v17.2d
3049        trn1            v17.2d,  v17.2d,  v18.2d
3050        mul             v4.8h,   v16.8h,  v2.8h
3051        mla             v4.8h,   v17.8h,  v3.8h
3052        subs            \h,  \h,  #2
3053.ifc \type, put
3054        urshr           v4.8h,   v4.8h,   #4
3055.else
3056        urshl           v4.8h,   v4.8h,   v31.8h
3057        sub             v4.8h,   v4.8h,   v29.8h
3058.endif
3059        st1             {v4.d}[0], [\dst], \d_strd
3060        st1             {v4.d}[1], [\ds2], \d_strd
3061        b.le            0f
3062        mov             v16.8b,  v18.8b
3063        b               4b
30640:
3065        ret
3066
306780:     // 8xN v
3068        AARCH64_VALID_JUMP_TARGET
3069        add             \ds2,  \dst,  \d_strd
3070        add             \sr2,  \src,  \s_strd
3071        lsl             \s_strd,  \s_strd,  #1
3072        lsl             \d_strd,  \d_strd,  #1
3073        ld1             {v16.8h}, [\src], \s_strd
30748:
3075        ld1             {v17.8h}, [\sr2], \s_strd
3076        ld1             {v18.8h}, [\src], \s_strd
3077        mul             v4.8h,   v16.8h,  v2.8h
3078        mla             v4.8h,   v17.8h,  v3.8h
3079        mul             v5.8h,   v17.8h,  v2.8h
3080        mla             v5.8h,   v18.8h,  v3.8h
3081        subs            \h,  \h,  #2
3082.ifc \type, put
3083        urshr           v4.8h,   v4.8h,   #4
3084        urshr           v5.8h,   v5.8h,   #4
3085.else
3086        urshl           v4.8h,   v4.8h,   v31.8h
3087        urshl           v5.8h,   v5.8h,   v31.8h
3088        sub             v4.8h,   v4.8h,   v29.8h
3089        sub             v5.8h,   v5.8h,   v29.8h
3090.endif
3091        st1             {v4.8h}, [\dst], \d_strd
3092        st1             {v5.8h}, [\ds2], \d_strd
3093        b.le            0f
3094        mov             v16.16b, v18.16b
3095        b               8b
30960:
3097        ret
3098
3099160:    // 16xN, 32xN, ...
3100320:
3101640:
31021280:
3103        AARCH64_VALID_JUMP_TARGET
3104        mov             \my, \h
31051:
3106        add             \ds2, \dst, \d_strd
3107        add             \sr2, \src, \s_strd
3108        lsl             \s_strd, \s_strd, #1
3109        lsl             \d_strd, \d_strd, #1
3110
3111        ld1             {v16.8h, v17.8h}, [\src], \s_strd
31122:
3113        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
3114        ld1             {v20.8h, v21.8h}, [\src], \s_strd
3115        mul             v4.8h,   v16.8h,  v2.8h
3116        mla             v4.8h,   v18.8h,  v3.8h
3117        mul             v5.8h,   v17.8h,  v2.8h
3118        mla             v5.8h,   v19.8h,  v3.8h
3119        mul             v6.8h,   v18.8h,  v2.8h
3120        mla             v6.8h,   v20.8h,  v3.8h
3121        mul             v7.8h,   v19.8h,  v2.8h
3122        mla             v7.8h,   v21.8h,  v3.8h
3123        subs            \h,  \h,  #2
3124.ifc \type, put
3125        urshr           v4.8h,   v4.8h,   #4
3126        urshr           v5.8h,   v5.8h,   #4
3127        urshr           v6.8h,   v6.8h,   #4
3128        urshr           v7.8h,   v7.8h,   #4
3129.else
3130        urshl           v4.8h,   v4.8h,   v31.8h
3131        urshl           v5.8h,   v5.8h,   v31.8h
3132        urshl           v6.8h,   v6.8h,   v31.8h
3133        urshl           v7.8h,   v7.8h,   v31.8h
3134        sub             v4.8h,   v4.8h,   v29.8h
3135        sub             v5.8h,   v5.8h,   v29.8h
3136        sub             v6.8h,   v6.8h,   v29.8h
3137        sub             v7.8h,   v7.8h,   v29.8h
3138.endif
3139        st1             {v4.8h, v5.8h}, [\dst], \d_strd
3140        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
3141        b.le            9f
3142        mov             v16.16b, v20.16b
3143        mov             v17.16b, v21.16b
3144        b               2b
31459:
3146        subs            \w,  \w,  #16
3147        b.le            0f
3148        asr             \s_strd, \s_strd, #1
3149        asr             \d_strd, \d_strd, #1
3150        msub            \src, \s_strd, \xmy, \src
3151        msub            \dst, \d_strd, \xmy, \dst
3152        sub             \src, \src, \s_strd, lsl #1
3153        mov             \h,  \my
3154        add             \src, \src, #32
3155        add             \dst, \dst, #32
3156        b               1b
31570:
3158        ret
3159
3160L(\type\()_bilin_v_tbl):
3161        .hword L(\type\()_bilin_v_tbl) - 1280b
3162        .hword L(\type\()_bilin_v_tbl) -  640b
3163        .hword L(\type\()_bilin_v_tbl) -  320b
3164        .hword L(\type\()_bilin_v_tbl) -  160b
3165        .hword L(\type\()_bilin_v_tbl) -   80b
3166        .hword L(\type\()_bilin_v_tbl) -   40b
3167        .hword L(\type\()_bilin_v_tbl) -   20b
3168        .hword 0
3169
3170L(\type\()_bilin_hv):
3171        adr             x10, L(\type\()_bilin_hv_tbl)
3172        dup             v31.8h,  w11      // 4 - intermediate_bits
3173        ldrh            w9,  [x10, x9, lsl #1]
3174        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
3175.ifc \type, put
3176        dup             v30.4s,  w12      // 4 + intermediate_bits
3177.else
3178        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
3179.endif
3180        sub             x10, x10, w9, uxtw
3181.ifc \type, put
3182        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
3183.endif
3184        br              x10
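        // Hedged sketch of the hv combination below: the horizontal pass
        // keeps intermediate_bits extra fractional bits, and the vertical
        // pass (mid0/mid1 being the horizontally filtered rows) finishes as
        //   put:  dst = ((16-my)*mid0 + my*mid1 + rnd) >> (4 + intermediate_bits)
        //   prep: dst = (((16-my)*mid0 + my*mid1 + rnd) >> 4) - PREP_BIAS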
3185
318620:     // 2xN hv
3187        AARCH64_VALID_JUMP_TARGET
3188.ifc \type, put
3189        add             \sr2, \src, \s_strd
3190        add             \ds2, \dst, \d_strd
3191        lsl             \s_strd, \s_strd, #1
3192        lsl             \d_strd, \d_strd, #1
3193
3194        ld1             {v20.4h},  [\src], \s_strd
3195        ext             v21.8b,  v20.8b,  v20.8b,  #2
3196        mul             v16.4h,  v20.4h,  v0.4h
3197        mla             v16.4h,  v21.4h,  v1.4h
3198        urshl           v16.4h,  v16.4h,  v31.4h
3199
32002:
3201        ld1             {v22.4h},  [\sr2], \s_strd
3202        ld1             {v24.4h},  [\src], \s_strd
3203        ext             v23.8b,  v22.8b,  v22.8b,  #2
3204        ext             v25.8b,  v24.8b,  v24.8b,  #2
3205        trn1            v22.2s,  v22.2s,  v24.2s
3206        trn1            v23.2s,  v23.2s,  v25.2s
3207        mul             v17.4h,  v22.4h,  v0.4h
3208        mla             v17.4h,  v23.4h,  v1.4h
3209        urshl           v17.4h,  v17.4h,  v31.4h
3210
3211        trn1            v16.2s,  v16.2s,  v17.2s
3212
3213        umull           v4.4s,   v16.4h,  v2.4h
3214        umlal           v4.4s,   v17.4h,  v3.4h
3215        urshl           v4.4s,   v4.4s,   v30.4s
3216        xtn             v4.4h,   v4.4s
3217        subs            \h,  \h,  #2
3218        st1             {v4.s}[0], [\dst], \d_strd
3219        st1             {v4.s}[1], [\ds2], \d_strd
3220        b.le            0f
3221        trn2            v16.2s,  v17.2s,  v17.2s
3222        b               2b
32230:
3224        ret
3225.endif
3226
322740:     // 4xN hv
3228        AARCH64_VALID_JUMP_TARGET
3229        add             \sr2, \src, \s_strd
3230        add             \ds2, \dst, \d_strd
3231        lsl             \s_strd, \s_strd, #1
3232        lsl             \d_strd, \d_strd, #1
3233
3234        ld1             {v20.8h},  [\src], \s_strd
3235        ext             v21.16b, v20.16b, v20.16b, #2
3236        mul             v16.4h,  v20.4h,  v0.4h
3237        mla             v16.4h,  v21.4h,  v1.4h
3238        urshl           v16.4h,  v16.4h,  v31.4h
3239
32404:
3241        ld1             {v22.8h},  [\sr2], \s_strd
3242        ld1             {v24.8h},  [\src], \s_strd
3243        ext             v23.16b, v22.16b, v22.16b, #2
3244        ext             v25.16b, v24.16b, v24.16b, #2
3245        trn1            v22.2d,  v22.2d,  v24.2d
3246        trn1            v23.2d,  v23.2d,  v25.2d
3247        mul             v17.8h,  v22.8h,  v0.8h
3248        mla             v17.8h,  v23.8h,  v1.8h
3249        urshl           v17.8h,  v17.8h,  v31.8h
3250
3251        trn1            v16.2d,  v16.2d,  v17.2d
3252
3253        umull           v4.4s,   v16.4h,  v2.4h
3254        umlal           v4.4s,   v17.4h,  v3.4h
3255        umull2          v5.4s,   v16.8h,  v2.8h
3256        umlal2          v5.4s,   v17.8h,  v3.8h
3257.ifc \type, put
3258        urshl           v4.4s,   v4.4s,   v30.4s
3259        urshl           v5.4s,   v5.4s,   v30.4s
3260        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
3261.else
3262        rshrn           v4.4h,   v4.4s,   #4
3263        rshrn2          v4.8h,   v5.4s,   #4
3264        sub             v4.8h,   v4.8h,   v29.8h
3265.endif
3266        subs            \h,  \h,  #2
3267        st1             {v4.d}[0], [\dst], \d_strd
3268        st1             {v4.d}[1], [\ds2], \d_strd
3269        b.le            0f
3270        trn2            v16.2d,  v17.2d,  v17.2d
3271        b               4b
32720:
3273        ret
3274
327580:     // 8xN, 16xN, ... hv
3276160:
3277320:
3278640:
32791280:
3280        AARCH64_VALID_JUMP_TARGET
3281        mov             \my, \h
3282
32831:
3284        add             \sr2, \src, \s_strd
3285        add             \ds2, \dst, \d_strd
3286        lsl             \s_strd, \s_strd, #1
3287        lsl             \d_strd, \d_strd, #1
3288
3289        ldr             h21, [\src, #16]
3290        ld1             {v20.8h},  [\src], \s_strd
3291        ext             v21.16b, v20.16b, v21.16b, #2
3292        mul             v16.8h,  v20.8h,  v0.8h
3293        mla             v16.8h,  v21.8h,  v1.8h
3294        urshl           v16.8h,  v16.8h,  v31.8h
3295
32962:
3297        ldr             h23, [\sr2, #16]
3298        ld1             {v22.8h},  [\sr2], \s_strd
3299        ldr             h25, [\src, #16]
3300        ld1             {v24.8h},  [\src], \s_strd
3301        ext             v23.16b, v22.16b, v23.16b, #2
3302        ext             v25.16b, v24.16b, v25.16b, #2
3303        mul             v17.8h,  v22.8h,  v0.8h
3304        mla             v17.8h,  v23.8h,  v1.8h
3305        mul             v18.8h,  v24.8h,  v0.8h
3306        mla             v18.8h,  v25.8h,  v1.8h
3307        urshl           v17.8h,  v17.8h,  v31.8h
3308        urshl           v18.8h,  v18.8h,  v31.8h
3309
3310        umull           v4.4s,   v16.4h,  v2.4h
3311        umlal           v4.4s,   v17.4h,  v3.4h
3312        umull2          v5.4s,   v16.8h,  v2.8h
3313        umlal2          v5.4s,   v17.8h,  v3.8h
3314        umull           v6.4s,   v17.4h,  v2.4h
3315        umlal           v6.4s,   v18.4h,  v3.4h
3316        umull2          v7.4s,   v17.8h,  v2.8h
3317        umlal2          v7.4s,   v18.8h,  v3.8h
3318.ifc \type, put
3319        urshl           v4.4s,   v4.4s,   v30.4s
3320        urshl           v5.4s,   v5.4s,   v30.4s
3321        urshl           v6.4s,   v6.4s,   v30.4s
3322        urshl           v7.4s,   v7.4s,   v30.4s
3323        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
3324        uzp1            v5.8h,   v6.8h,   v7.8h  // Ditto
3325.else
3326        rshrn           v4.4h,   v4.4s,   #4
3327        rshrn2          v4.8h,   v5.4s,   #4
3328        rshrn           v5.4h,   v6.4s,   #4
3329        rshrn2          v5.8h,   v7.4s,   #4
3330        sub             v4.8h,   v4.8h,   v29.8h
3331        sub             v5.8h,   v5.8h,   v29.8h
3332.endif
3333        subs            \h,  \h,  #2
3334        st1             {v4.8h}, [\dst], \d_strd
3335        st1             {v5.8h}, [\ds2], \d_strd
3336        b.le            9f
3337        mov             v16.16b, v18.16b
3338        b               2b
33399:
3340        subs            \w,  \w,  #8
3341        b.le            0f
3342        asr             \s_strd,  \s_strd,  #1
3343        asr             \d_strd,  \d_strd,  #1
3344        msub            \src,  \s_strd,  \xmy,  \src
3345        msub            \dst,  \d_strd,  \xmy,  \dst
3346        sub             \src,  \src,  \s_strd,  lsl #1
3347        mov             \h,  \my
3348        add             \src,  \src,  #16
3349        add             \dst,  \dst,  #16
3350        b               1b
33510:
3352        ret
3353
3354L(\type\()_bilin_hv_tbl):
3355        .hword L(\type\()_bilin_hv_tbl) - 1280b
3356        .hword L(\type\()_bilin_hv_tbl) -  640b
3357        .hword L(\type\()_bilin_hv_tbl) -  320b
3358        .hword L(\type\()_bilin_hv_tbl) -  160b
3359        .hword L(\type\()_bilin_hv_tbl) -   80b
3360        .hword L(\type\()_bilin_hv_tbl) -   40b
3361        .hword L(\type\()_bilin_hv_tbl) -   20b
3362        .hword 0
3363endfunc
3364.endm
3365
3366make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
3367make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
3368make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
3369make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
3370make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
3371filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
3372
3373make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
3374make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
3375make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
3376make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
3377filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
3378filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10
3379
3380make_8tap_fn    prep,  regular_sharp,  REGULAR, SHARP,   8tap
3381make_8tap_fn    prep,  smooth_sharp,   SMOOTH,  SHARP,   8tap
3382make_8tap_fn    prep,  sharp,          SHARP,   SHARP,   8tap
3383make_8tap_fn    prep,  sharp_regular,  SHARP,   REGULAR, 8tap
3384make_8tap_fn    prep,  sharp_smooth,   SHARP,   SMOOTH,  8tap
3385filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
3386
3387make_8tap_fn    prep,  regular,        REGULAR, REGULAR, 6tap
3388make_8tap_fn    prep,  regular_smooth, REGULAR, SMOOTH,  6tap
3389make_8tap_fn    prep,  smooth,         SMOOTH,  SMOOTH,  6tap
3390make_8tap_fn    prep,  smooth_regular, SMOOTH,  REGULAR, 6tap
3391filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
3392filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
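// The REGULAR/SMOOTH-only combinations above are routed to the 6tap variant
// (those filters presumably have at most 6 non-zero taps), while anything
// involving SHARP takes the full 8tap path.  Note that prep has no
// destination stride argument: x8 is scratch here and d_strd is derived
// inside the function as w * sizeof(pixel).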
3393
3394
3395.macro load_filter_row dst, src, inc
3396        asr             w13, \src, #10
3397        add             \src, \src, \inc
3398        ldr             \dst, [x11, w13, sxtw #3]
3399.endm
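// load_filter_row fetches one 8-tap warp filter (8 x int8 = 8 bytes): the
// callers below bias the position by +512 and point x11 64 rows into
// X(mc_warp_filter), so this is roughly
//   filter = mc_warp_filter[64 + (pos >> 10)];  pos += step;
// with the per-pixel/per-row step passed in \inc.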
3400
3401function warp_filter_horz_neon
3402        add             w12, w5,  #512
3403
3404        ld1             {v16.8h, v17.8h}, [x2], x3
3405
3406        load_filter_row d0, w12, w7
3407        load_filter_row d1, w12, w7
3408        load_filter_row d2, w12, w7
3409        sxtl            v0.8h,   v0.8b
3410        load_filter_row d3, w12, w7
3411        sxtl            v1.8h,   v1.8b
3412        load_filter_row d4, w12, w7
3413        sxtl            v2.8h,   v2.8b
3414        load_filter_row d5, w12, w7
3415        sxtl            v3.8h,   v3.8b
3416        load_filter_row d6, w12, w7
3417        sxtl            v4.8h,   v4.8b
3418        load_filter_row d7, w12, w7
3419        sxtl            v5.8h,   v5.8b
3420        ext             v18.16b, v16.16b, v17.16b, #2*1
3421        smull           v8.4s,   v16.4h,  v0.4h
3422        smull2          v9.4s,   v16.8h,  v0.8h
3423        sxtl            v6.8h,   v6.8b
3424        ext             v19.16b, v16.16b, v17.16b, #2*2
3425        smull           v10.4s,  v18.4h,  v1.4h
3426        smull2          v11.4s,  v18.8h,  v1.8h
3427        sxtl            v7.8h,   v7.8b
3428        ext             v20.16b, v16.16b, v17.16b, #2*3
3429        smull           v0.4s,   v19.4h,  v2.4h
3430        smull2          v1.4s,   v19.8h,  v2.8h
3431        ext             v21.16b, v16.16b, v17.16b, #2*4
3432        addp            v8.4s,   v8.4s,   v9.4s
3433        smull           v2.4s,   v20.4h,  v3.4h
3434        smull2          v3.4s,   v20.8h,  v3.8h
3435        ext             v22.16b, v16.16b, v17.16b, #2*5
3436        addp            v9.4s,   v10.4s,  v11.4s
3437        smull           v10.4s,  v21.4h,  v4.4h
3438        smull2          v11.4s,  v21.8h,  v4.8h
3439        ext             v23.16b, v16.16b, v17.16b, #2*6
3440        addp            v0.4s,   v0.4s,   v1.4s
3441        smull           v18.4s,  v22.4h,  v5.4h
3442        smull2          v19.4s,  v22.8h,  v5.8h
3443        ext             v16.16b, v16.16b, v17.16b, #2*7
3444        addp            v1.4s,   v2.4s,   v3.4s
3445        addp            v2.4s,   v10.4s,  v11.4s
3446        smull           v20.4s,  v23.4h,  v6.4h
3447        smull2          v21.4s,  v23.8h,  v6.8h
3448        addp            v3.4s,   v18.4s,  v19.4s
3449        smull           v22.4s,  v16.4h,  v7.4h
3450        smull2          v23.4s,  v16.8h,  v7.8h
3451        addp            v4.4s,   v20.4s,  v21.4s
3452        addp            v5.4s,   v22.4s,  v23.4s
3453
3454        addp            v8.4s,   v8.4s,   v9.4s
3455        addp            v0.4s,   v0.4s,   v1.4s
3456        addp            v2.4s,   v2.4s,   v3.4s
3457        addp            v4.4s,   v4.4s,   v5.4s
3458
3459        addp            v16.4s,  v8.4s,   v0.4s
3460        addp            v17.4s,  v2.4s,   v4.4s
3461
3462        add             w5,  w5,  w8
3463
3464        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
3465        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
3466
3467        ret
3468endfunc
3469
3470// void dav1d_warp_affine_8x8_16bpc_neon(
3471//         pixel *dst, const ptrdiff_t dst_stride,
3472//         const pixel *src, const ptrdiff_t src_stride,
3473//         const int16_t *const abcd, int mx, int my,
3474//         const int bitdepth_max)
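// Hedged outline of the implementation below: warp_filter_horz_neon runs the
// horizontal 8-tap pass (per-pixel filters selected from mx, stepped by
// abcd[0] per pixel and abcd[1] per row) over 7 setup rows plus one new row
// per output row; the vertical pass then applies a per-column 8-tap filter
// (selected from my, stepped by abcd[2] per column and abcd[3] per row) to
// the 8 buffered rows, roughly (illustrative names)
//   dst[y][x] = finish(sum_k vfilter[x][k] * mid[y + k][x])
// where finish() is the analogous put clipping to bitdepth_max resp. prep
// PREP_BIAS offset, here with a 7-bit filter scale.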
3475.macro warp t
3476function warp_affine_8x8\t\()_16bpc_neon, export=1
3477        stp             d8,  d9,  [sp, #-0x40]!
3478        stp             d10, d11, [sp, #0x10]
3479        stp             d12, d13, [sp, #0x20]
3480        stp             d14, d15, [sp, #0x30]
3481
3482.ifb \t
3483        dup             v15.8h,  w7        // bitdepth_max
3484.else
3485        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
3486.endif
3487        clz             w7,  w7
3488                                           // intermediate_bits = clz(bitdepth_max) - 18
3489.ifb \t
3490        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
3491.endif
3492        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
3493.ifb \t
3494        neg             w8,  w8            // -(7 + intermediate_bits)
3495.endif
3496        dup             v14.4s,  w7        // -(7 - intermediate_bits)
3497.ifb \t
3498        dup             v13.4s,  w8        // -(7 + intermediate_bits)
3499.endif
3500
3501        ldr             x4,  [x4]
3502        sbfx            x7,  x4, #0,  #16
3503        sbfx            x8,  x4, #16, #16
3504        sbfx            x9,  x4, #32, #16
3505        sbfx            x4,  x4, #48, #16
3506        mov             w10, #8
3507        sub             x2,  x2,  x3, lsl #1
3508        sub             x2,  x2,  x3
3509        sub             x2,  x2,  #6
3510        movrel          x11, X(mc_warp_filter), 64*8
3511        mov             x15, x30
3512.ifnb \t
3513        lsl             x1,  x1,  #1
3514.endif
3515
3516        bl              warp_filter_horz_neon
3517        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
3518        bl              warp_filter_horz_neon
3519        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
3520        bl              warp_filter_horz_neon
3521        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
3522        bl              warp_filter_horz_neon
3523        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
3524        bl              warp_filter_horz_neon
3525        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
3526        bl              warp_filter_horz_neon
3527        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
3528        bl              warp_filter_horz_neon
3529        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto
3530
35311:
3532        add             w14, w6,  #512
3533        bl              warp_filter_horz_neon
3534        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
3535
3536        load_filter_row d0, w14, w9
3537        load_filter_row d1, w14, w9
3538        load_filter_row d2, w14, w9
3539        load_filter_row d3, w14, w9
3540        load_filter_row d4, w14, w9
3541        load_filter_row d5, w14, w9
3542        load_filter_row d6, w14, w9
3543        load_filter_row d7, w14, w9
3544        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
3545
3546        // This ordering of smull/smlal/smull2/smlal2 is highly
3547        // beneficial for Cortex A53 here.
3548        smull           v16.4s,  v24.4h,  v0.4h
3549        smlal           v16.4s,  v25.4h,  v1.4h
3550        smlal           v16.4s,  v26.4h,  v2.4h
3551        smlal           v16.4s,  v27.4h,  v3.4h
3552        smlal           v16.4s,  v28.4h,  v4.4h
3553        smlal           v16.4s,  v29.4h,  v5.4h
3554        smlal           v16.4s,  v30.4h,  v6.4h
3555        smlal           v16.4s,  v31.4h,  v7.4h
3556        smull2          v17.4s,  v24.8h,  v0.8h
3557        smlal2          v17.4s,  v25.8h,  v1.8h
3558        smlal2          v17.4s,  v26.8h,  v2.8h
3559        smlal2          v17.4s,  v27.8h,  v3.8h
3560        smlal2          v17.4s,  v28.8h,  v4.8h
3561        smlal2          v17.4s,  v29.8h,  v5.8h
3562        smlal2          v17.4s,  v30.8h,  v6.8h
3563        smlal2          v17.4s,  v31.8h,  v7.8h
3564
3565        mov             v24.16b, v25.16b
3566        mov             v25.16b, v26.16b
3567.ifb \t
3568        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
3569        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
3570.else
3571        rshrn           v16.4h,  v16.4s,  #7
3572        rshrn2          v16.8h,  v17.4s,  #7
3573.endif
3574        mov             v26.16b, v27.16b
3575.ifb \t
3576        sqxtun          v16.4h,  v16.4s
3577        sqxtun2         v16.8h,  v17.4s
3578.else
3579        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
3580.endif
3581        mov             v27.16b, v28.16b
3582        mov             v28.16b, v29.16b
3583.ifb \t
3584        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
3585.endif
3586        mov             v29.16b, v30.16b
3587        mov             v30.16b, v31.16b
3588        subs            w10, w10, #1
3589        st1             {v16.8h}, [x0], x1
3590
3591        add             w6,  w6,  w4
3592        b.gt            1b
3593
3594        ldp             d14, d15, [sp, #0x30]
3595        ldp             d12, d13, [sp, #0x20]
3596        ldp             d10, d11, [sp, #0x10]
3597        ldp             d8,  d9,  [sp], 0x40
3598
3599        ret             x15
3600endfunc
3601.endm
3602
3603warp
3604warp t
3605
3606// void dav1d_emu_edge_16bpc_neon(
3607//         const intptr_t bw, const intptr_t bh,
3608//         const intptr_t iw, const intptr_t ih,
3609//         const intptr_t x, const intptr_t y,
3610//         pixel *dst, const ptrdiff_t dst_stride,
3611//         const pixel *ref, const ptrdiff_t ref_stride)
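// Hedged outline: the code below first clamps ref into the valid iw x ih
// area, then computes
//   left_ext  = iclip(-x, 0, bw - 1),  right_ext  = iclip(x + bw - iw, 0, bw - 1)
//   top_ext   = iclip(-y, 0, bh - 1),  bottom_ext = iclip(y + bh - ih, 0, bh - 1)
// copies the center_h x center_w area with the edge pixels replicated
// left_ext/right_ext times horizontally, and finally replicates the first
// and last written rows upwards/downwards top_ext/bottom_ext times.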
3612function emu_edge_16bpc_neon, export=1
3613        ldp             x8,  x9,  [sp]
3614
3615        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
3616        // ref += iclip(x, 0, iw - 1)
3617        sub             x12, x3,  #1           // ih - 1
3618        cmp             x5,  x3
3619        sub             x13, x2,  #1           // iw - 1
3620        csel            x12, x12, x5,  ge      // min(y, ih - 1)
3621        cmp             x4,  x2
3622        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
3623        csel            x13, x13, x4,  ge      // min(x, iw - 1)
3624        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
3625        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
3626        add             x8,  x8,  x13, lsl #1  // ref += iclip()
3627
3628        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
3629        // top_ext = iclip(-y, 0, bh - 1)
3630        add             x10, x5,  x1           // y + bh
3631        neg             x5,  x5                // -y
3632        sub             x10, x10, x3           // y + bh - ih
3633        sub             x12, x1,  #1           // bh - 1
3634        cmp             x10, x1
3635        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
3636        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
3637        cmp             x5,  x1
3638        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
3639        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
3640
3641        // right_ext = iclip(x + bw - iw, 0, bw - 1)
3642        // left_ext = iclip(-x, 0, bw - 1)
3643        add             x11, x4,  x0           // x + bw
3644        neg             x4,  x4                // -x
3645        sub             x11, x11, x2           // x + bw - iw
3646        sub             x13, x0,  #1           // bw - 1
3647        cmp             x11, x0
3648        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
3649        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
3650        cmp             x4,  x0
3651        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
3652        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
3653
3654        // center_h = bh - top_ext - bottom_ext
3655        // dst += top_ext * PXSTRIDE(dst_stride)
3656        // center_w = bw - left_ext - right_ext
3657        sub             x1,  x1,  x5           // bh - top_ext
3658        madd            x6,  x5,  x7,  x6
3659        sub             x2,  x0,  x4           // bw - left_ext
3660        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
3661        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
3662
3663        mov             x14, x6                // backup of dst
3664
3665.macro v_loop need_left, need_right
36660:
3667.if \need_left
3668        ld1r            {v0.8h}, [x8]
3669        mov             x12, x6                // out = dst
3670        mov             x3,  x4
3671        mov             v1.16b,  v0.16b
36721:
3673        subs            x3,  x3,  #16
3674        st1             {v0.8h, v1.8h}, [x12], #32
3675        b.gt            1b
3676.endif
3677        mov             x13, x8
3678        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
3679        mov             x3,  x2
36801:
3681        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
3682        subs            x3,  x3,  #32
3683        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
3684        b.gt            1b
3685.if \need_right
3686        add             x3,  x8,  x2, lsl #1   // in + center_w
3687        sub             x3,  x3,  #2           // in + center_w - 1
3688        add             x12, x6,  x4, lsl #1   // dst + left_ext
3689        ld1r            {v0.8h}, [x3]
3690        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
3691        mov             x3,  x11
3692        mov             v1.16b,  v0.16b
36931:
3694        subs            x3,  x3,  #16
3695        st1             {v0.8h, v1.8h}, [x12], #32
3696        b.gt            1b
3697.endif
3698
3699        subs            x1,  x1,  #1           // center_h--
3700        add             x6,  x6,  x7
3701        add             x8,  x8,  x9
3702        b.gt            0b
3703.endm
3704
3705        cbz             x4,  2f
3706        // need_left
3707        cbz             x11, 3f
3708        // need_left + need_right
3709        v_loop          1,   1
3710        b               5f
3711
37122:
3713        // !need_left
3714        cbz             x11, 4f
3715        // !need_left + need_right
3716        v_loop          0,   1
3717        b               5f
3718
37193:
3720        // need_left + !need_right
3721        v_loop          1,   0
3722        b               5f
3723
37244:
3725        // !need_left + !need_right
3726        v_loop          0,   0
3727
37285:
3729
3730        cbz             x10, 3f
3731        // need_bottom
3732        sub             x8,  x6,  x7           // ref = dst - stride
3733        mov             x4,  x0
37341:
3735        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
3736        mov             x3,  x10
37372:
3738        subs            x3,  x3,  #1
3739        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3740        b.gt            2b
3741        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
3742        subs            x4,  x4,  #32          // bw -= 32
3743        add             x6,  x6,  #64          // dst += 32
3744        b.gt            1b
3745
37463:
3747        cbz             x5,  3f
3748        // need_top
3749        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
37501:
3751        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
3752        mov             x3,  x5
37532:
3754        subs            x3,  x3,  #1
3755        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
3756        b.gt            2b
3757        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
3758        subs            x0,  x0,  #32          // bw -= 32
3759        add             x6,  x6,  #64          // dst += 32
3760        b.gt            1b
3761
37623:
3763        ret
3764endfunc
3765