/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
function ipred_dc_128_8bpc_neon, export=1
        // DC_128 prediction: fill the width x height block with the
        // constant 128 (half range at 8 bpc).
        // width is a power of two in [4, 64]; clz(width) - 25 gives an
        // index 0..4 into the .hword offset table (entries ordered 64..4).
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        movi            v0.16b,  #128
        sub             x5,  x5,  w3, uxtw      // x5 = branch target for this width
        add             x6,  x0,  x1            // x6 = dst + stride (second row)
        lsl             x1,  x1,  #1            // both pointers step two rows
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4            // four rows per iteration
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        movi            v1.16b,  #128           // second 16-byte lane of the row
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        movi            v1.16b,  #128           // lanes 2..4 of the 64-byte row
        movi            v2.16b,  #128
        movi            v3.16b,  #128
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) -  16b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc

// void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
function ipred_v_8bpc_neon, export=1
        // Vertical prediction: copy the row above the block into every
        // output row.
        // clz(width) - 25 indexes the offset table (width = 4..64, pow2).
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #1            // skip topleft; x2 = top row
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1            // x6 = dst + stride
        lsl             x1,  x1,  #1            // both pointers step two rows
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2]        // 4 top pixels
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4            // four rows per iteration
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]          // 8 top pixels
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]          // 16 top pixels
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2]  // 32 top pixels
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // 64 top pixels
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc

// void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                        const pixel *const topleft,
//                        const int width, const int height, const int a,
//                        const int max_width, const int max_height);
function ipred_h_8bpc_neon, export=1
        // Horizontal prediction: each output row is its left-edge pixel
        // repeated across the whole row.
        // ld4r loads 4 consecutive left pixels (topleft[-4..-1]) and
        // broadcasts each into its own vector; since the left edge sits
        // below topleft in memory, the last loaded byte (v3) belongs to
        // the topmost of the four rows, hence the v3..v0 store order and
        // the -4 post-decrement (x7) walking up the edge.
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2,  #4            // x2 = topleft - 4
        sub             x5,  x5,  w3, uxtw
        mov             x7,  #-4                // step 4 left pixels backwards
        add             x6,  x0,  x1            // x6 = dst + stride
        lsl             x1,  x1,  #1            // both pointers step two rows
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.s}[0],  [x0], x1
        st1             {v2.s}[0],  [x6], x1
        subs            w4,  w4,  #4            // four rows per iteration
        st1             {v1.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
        st1             {v3.8b},  [x0], x1
        st1             {v2.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        st1             {v3.16b}, [x0], x1
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]          // second 16 bytes of the row
        str             q2,  [x6, #16]
        st1             {v3.16b}, [x0], x1      // first 16 bytes + row advance
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
        str             q3,  [x0, #16]          // bytes 16..63 of the row
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.16b}, [x0], x1      // bytes 0..15 + row advance
        st1             {v2.16b}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc

// void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                             const pixel *const topleft,
//                             const int width, const int height, const int a,
//                             const int max_width, const int max_height);
function ipred_dc_top_8bpc_neon, export=1
        // DC_TOP prediction: fill the block with the rounded average of
        // the width pixels in the row above (uaddlv sum, then rshrn by
        // log2 of the summed count for a rounded divide).
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3,  #25           // table index from clz(width)
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #1            // skip topleft; x2 = top row
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1            // x6 = dst + stride
        lsl             x1,  x1,  #1            // both pointers step two rows
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]          // 4 top pixels, duplicated twice
        uaddlv          h0,      v0.8b          // sums them twice...
        rshrn           v0.8b,   v0.8h,   #3    // ...so /8 gives rounded /4
        dup             v0.8b,   v0.b[0]
4:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4            // four rows per iteration
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b          // sum of 8 pixels
        rshrn           v0.8b,   v0.8h,   #3    // rounded /8
        dup             v0.8b,   v0.b[0]
8:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b         // sum of 16 pixels
        rshrn           v0.8b,   v0.8h,   #4    // rounded /16
        dup             v0.16b,  v0.b[0]
16:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,      v0.16b         // partial sums of 16 pixels each
        uaddlv          h1,      v1.16b
        add             v2.4h,   v0.4h,   v1.4h
        rshrn           v2.8b,   v2.8h,   #5    // rounded /32
        dup             v0.16b,  v2.b[0]
        dup             v1.16b,  v2.b[0]
32:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,      v0.16b         // four partial sums
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v4.4h,   v0.4h,   v1.4h
        add             v5.4h,   v2.4h,   v3.4h
        add             v4.4h,   v4.4h,   v5.4h
        rshrn           v4.8b,   v4.8h,   #6    // rounded /64
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
        dup             v2.16b,  v4.b[0]
        dup             v3.16b,  v4.b[0]
64:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc

// void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
function ipred_dc_left_8bpc_neon, export=1
        // DC_LEFT prediction: fill the block with the rounded average of
        // the height left-edge pixels (at topleft-height .. topleft-1).
        // Two-stage dispatch through one shared table holding h64..h4
        // followed by w64..w4: br x5 jumps to the height case (sum the
        // left edge), which then br x3 jumps to the width case (store
        // loop).
        sub             x2,  x2,  w4, uxtw      // x2 = topleft - height
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3, uxtw      // x3 = width (store) entry
        sub             x5,  x5,  w7, uxtw      // x5 = height (sum) entry
        add             x6,  x0,  x1            // x6 = dst + stride
        lsl             x1,  x1,  #1            // both pointers step two rows
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]          // 4 left pixels, duplicated twice
        uaddlv          h0,      v0.8b          // sums them twice...
        rshrn           v0.8b,   v0.8h,   #3    // ...so /8 gives rounded /4
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4            // four rows per iteration
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b          // sum of 8 pixels
        rshrn           v0.8b,   v0.8h,   #3    // rounded /8
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b         // sum of 16 pixels
        rshrn           v0.8b,   v0.8h,   #4    // rounded /16
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            L(ipred_dc_left_w16)
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2]
        uaddlv          h0,      v0.16b         // two partial sums
        uaddlv          h1,      v1.16b
        add             v0.4h,   v0.4h,   v1.4h
        rshrn           v0.8b,   v0.8h,   #5    // rounded /32
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b         // widen dc to a full 32-byte row
1:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
        uaddlv          h0,      v0.16b         // four partial sums
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v0.4h,   v1.4h
        add             v2.4h,   v2.4h,   v3.4h
        add             v0.4h,   v0.4h,   v2.4h
        rshrn           v0.8b,   v0.8h,   #6    // rounded /64
        dup             v0.16b,  v0.b[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b         // widen dc to a full 64-byte row
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
function ipred_dc_8bpc_neon, export=1
        // Full DC prediction:
        //   dc = (sum(left) + sum(top) + ((w + h) >> 1)) / (w + h)
        // The divide is a right shift by ctz(w + h); when w != h, w + h is
        // 3 or 5 times a power of two, so a sqdmulh by a Q15 constant
        // finishes the job (0x5556/2 ~ 1/3, 0x3334/2 ~ 1/5; the constants
        // are pre-halved because sqdmulh doubles the product).
        // Dispatch mirrors ipred_dc_left: one shared table of h64..h4
        // then w64..w4 entries; br x5 to the height case (sum left edge),
        // which then br x3 to the width case (sum top edge + store).
        sub             x2,  x2,  w4, uxtw       // x2 = topleft - height
        add             w7,  w3,  w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.8h, w7               // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                  // rbit(width + height)
        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                  // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                  // -ctz(width + height)
        sub             x3,  x5,  w3, uxtw       // x3 = width entry
        sub             x5,  x5,  w6, uxtw       // x5 = height entry
        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
        dup             v17.8h,  w7              // -ctz(width + height)
        add             x6,  x0,  x1             // x6 = dst + stride
        lsl             x1,  x1,  #1             // both pointers step two rows
        br              x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2], #4    // 4 left pixels
        ins             v0.s[1], wzr            // clear lanes 4..7 before uaddlv
        uaddlv          h0,      v0.8b          // v0.h[0] = sum(left)
        add             x2,  x2,  #1            // skip topleft; x2 = top row
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.s}[0],  [x2]        // 4 top pixels
        ins             v1.s[1], wzr            // clear lanes 4..7 before uaddlv
        add             v0.4h,   v0.4h,   v16.4h // + rounding bias (w+h)>>1
        uaddlv          h1,      v1.8b
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v1.4h  // + sum(top)
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f                       // square: shift alone suffices
        // h = 8/16
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17            // h=8: >>16 picks 1/3 (w+h=12);
                                                 // h=16: >>32 wraps to >>0, low
                                                 // half picks 1/5 (w+h=20)
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8b,   v0.b[0]
2:
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        subs            w4,  w4,  #4             // four rows per iteration
        st1             {v0.s}[0],  [x0], x1
        st1             {v0.s}[0],  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2], #8      // 8 left pixels
        uaddlv          h0,      v0.8b
        add             x2,  x2,  #1            // skip topleft; x2 = top row
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8b},  [x2]          // 8 top pixels
        add             v0.4h,   v0.4h,   v16.4h // + rounding bias
        uaddlv          h1,      v1.8b
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v1.4h  // + sum(top)
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f                       // square: done
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #(0x3334/2)         // 1/5 for h=32 (w+h=40)
        mov             w17, #(0x5556/2)         // 1/3 for h=4/16 (w+h=12/24)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.8b,   v0.b[0]
2:
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8b},  [x0], x1
        st1             {v0.8b},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2], #16     // 16 left pixels
        uaddlv          h0,      v0.16b
        add             x2,  x2,  #1            // skip topleft; x2 = top row
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b}, [x2]          // 16 top pixels
        add             v0.4h,   v0.4h,   v16.4h // + rounding bias
        uaddlv          h1,      v1.16b
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v1.4h  // + sum(top)
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f                       // square: done
        // h = 4/8/32/64
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #(0x3334/2)         // 1/5 for h=4/64 (w+h=20/80)
        mov             w17, #(0x5556/2)         // 1/3 for h=8/32 (w+h=24/48)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h
1:
        dup             v0.16b,  v0.b[0]
2:
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b}, [x0], x1
        st1             {v0.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b}, [x2], #32 // 32 left pixels
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        add             x2,  x2,  #1            // skip topleft; x2 = top row
        add             v0.4h,   v0.4h,   v1.4h
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b}, [x2]  // 32 top pixels
        add             v0.4h,   v0.4h,   v16.4h // + rounding bias
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v1.4h  // + sum(top)
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v4.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f                       // square: done
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #(0x3334/2)         // 1/5 for h=8 (w+h=40)
        mov             w17, #(0x5556/2)         // 1/3 for h=16/64 (w+h=48/96)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v4.4h,   v4.4h,   v16.4h
1:
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
2:
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b}, [x0], x1
        st1             {v0.16b, v1.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // 64 left pixels
        uaddlv          h0,      v0.16b
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v0.4h,   v0.4h,   v1.4h
        add             v2.4h,   v2.4h,   v3.4h
        add             x2,  x2,  #1            // skip topleft; x2 = top row
        add             v0.4h,   v0.4h,   v2.4h
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2] // 64 top pixels
        add             v0.4h,   v0.4h,   v16.4h // + rounding bias
        uaddlv          h1,      v1.16b
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        uaddlv          h4,      v4.16b
        add             v1.4h,   v1.4h,   v2.4h
        add             v3.4h,   v3.4h,   v4.4h
        cmp             w4,  #64
        add             v0.4h,   v0.4h,   v1.4h  // + sum(top)
        add             v0.4h,   v0.4h,   v3.4h
        ushl            v4.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f                       // square: done
        // h = 16/32
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        lsr             w16, w16, w4             // h=16: >>16 picks 1/5 (w+h=80);
                                                 // h=32: >>32 wraps to >>0, low
                                                 // half picks 1/3 (w+h=96)
        dup             v16.4h,  w16
        sqdmulh         v4.4h,   v4.4h,   v16.4h
1:
        dup             v0.16b,  v4.b[0]
        dup             v1.16b,  v4.b[0]
        dup             v2.16b,  v4.b[0]
        dup             v3.16b,  v4.b[0]
2:
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc

// void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                            const pixel *const topleft,
//                            const int width, const int height, const int a,
//                            const int max_width, const int max_height);
734function ipred_paeth_8bpc_neon, export=1
735        clz             w9,  w3
736        adr             x5,  L(ipred_paeth_tbl)
737        sub             w9,  w9,  #25
738        ldrh            w9,  [x5, w9, uxtw #1]
739        ld1r            {v4.16b},  [x2]
740        add             x8,  x2,  #1
741        sub             x2,  x2,  #4
742        sub             x5,  x5,  w9, uxtw
743        mov             x7,  #-4
744        add             x6,  x0,  x1
745        lsl             x1,  x1,  #1
746        br              x5
74740:
748        AARCH64_VALID_JUMP_TARGET
749        ld1r            {v5.4s},  [x8]
750        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
7514:
752        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
753        zip1            v0.2s,   v0.2s,   v1.2s
754        zip1            v2.2s,   v2.2s,   v3.2s
755        uaddw           v16.8h,  v6.8h,   v0.8b
756        uaddw           v17.8h,  v6.8h,   v2.8b
757        sqxtun          v16.8b,  v16.8h           // base
758        sqxtun2         v16.16b, v17.8h
759        zip1            v0.2d,   v0.2d,   v2.2d
760        uabd            v20.16b, v5.16b,  v16.16b // tdiff
761        uabd            v22.16b, v4.16b,  v16.16b // tldiff
762        uabd            v16.16b, v0.16b,  v16.16b // ldiff
763        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
764        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
765        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
766        bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
767        bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ...
768        st1             {v20.s}[3], [x0], x1
769        st1             {v20.s}[2], [x6], x1
770        subs            w4,  w4,  #4
771        st1             {v20.s}[1], [x0], x1
772        st1             {v20.s}[0], [x6], x1
773        b.gt            4b
774        ret
77580:
776        AARCH64_VALID_JUMP_TARGET
777        ld1r            {v5.2d},  [x8]
778        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
7798:
780        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
781        uaddw           v16.8h,  v6.8h,   v0.8b
782        uaddw           v17.8h,  v6.8h,   v1.8b
783        uaddw           v18.8h,  v6.8h,   v2.8b
784        uaddw           v19.8h,  v6.8h,   v3.8b
785        sqxtun          v16.8b,  v16.8h           // base
786        sqxtun2         v16.16b, v17.8h
787        sqxtun          v18.8b,  v18.8h
788        sqxtun2         v18.16b, v19.8h
789        zip1            v2.2d,   v2.2d,   v3.2d
790        zip1            v0.2d,   v0.2d,   v1.2d
791        uabd            v21.16b, v5.16b,  v18.16b // tdiff
792        uabd            v20.16b, v5.16b,  v16.16b
793        uabd            v23.16b, v4.16b,  v18.16b // tldiff
794        uabd            v22.16b, v4.16b,  v16.16b
795        uabd            v17.16b, v2.16b,  v18.16b // ldiff
796        uabd            v16.16b, v0.16b,  v16.16b
797        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
798        umin            v18.16b, v20.16b, v22.16b
799        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
800        cmhs            v20.16b, v22.16b, v20.16b
801        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
802        cmhs            v16.16b, v18.16b, v16.16b
803        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
804        bsl             v20.16b, v5.16b,  v4.16b
805        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
806        bit             v20.16b, v0.16b,  v16.16b
807        st1             {v21.d}[1], [x0], x1
808        st1             {v21.d}[0], [x6], x1
809        subs            w4,  w4,  #4
810        st1             {v20.d}[1], [x0], x1
811        st1             {v20.d}[0], [x6], x1
812        b.gt            8b
813        ret
814160:
815320:
816640:
817        AARCH64_VALID_JUMP_TARGET
818        ld1             {v5.16b},  [x8], #16
819        mov             w9,  w3
820        // Set up pointers for four rows in parallel; x0, x6, x5, x10
821        add             x5,  x0,  x1
822        add             x10, x6,  x1
823        lsl             x1,  x1,  #1
824        sub             x1,  x1,  w3, uxtw
8251:
826        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
8272:
828        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
829        usubl2          v7.8h,   v5.16b,  v4.16b
830        uaddw           v24.8h,  v6.8h,   v0.8b
831        uaddw           v25.8h,  v7.8h,   v0.8b
832        uaddw           v26.8h,  v6.8h,   v1.8b
833        uaddw           v27.8h,  v7.8h,   v1.8b
834        uaddw           v28.8h,  v6.8h,   v2.8b
835        uaddw           v29.8h,  v7.8h,   v2.8b
836        uaddw           v30.8h,  v6.8h,   v3.8b
837        uaddw           v31.8h,  v7.8h,   v3.8b
838        sqxtun          v17.8b,  v26.8h           // base
839        sqxtun2         v17.16b, v27.8h
840        sqxtun          v16.8b,  v24.8h
841        sqxtun2         v16.16b, v25.8h
842        sqxtun          v19.8b,  v30.8h
843        sqxtun2         v19.16b, v31.8h
844        sqxtun          v18.8b,  v28.8h
845        sqxtun2         v18.16b, v29.8h
846        uabd            v23.16b, v5.16b,  v19.16b // tdiff
847        uabd            v22.16b, v5.16b,  v18.16b
848        uabd            v21.16b, v5.16b,  v17.16b
849        uabd            v20.16b, v5.16b,  v16.16b
850        uabd            v27.16b, v4.16b,  v19.16b // tldiff
851        uabd            v26.16b, v4.16b,  v18.16b
852        uabd            v25.16b, v4.16b,  v17.16b
853        uabd            v24.16b, v4.16b,  v16.16b
854        uabd            v19.16b, v3.16b,  v19.16b // ldiff
855        uabd            v18.16b, v2.16b,  v18.16b
856        uabd            v17.16b, v1.16b,  v17.16b
857        uabd            v16.16b, v0.16b,  v16.16b
858        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
859        umin            v30.16b, v22.16b, v26.16b
860        umin            v29.16b, v21.16b, v25.16b
861        umin            v28.16b, v20.16b, v24.16b
862        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
863        cmhs            v22.16b, v26.16b, v22.16b
864        cmhs            v21.16b, v25.16b, v21.16b
865        cmhs            v20.16b, v24.16b, v20.16b
866        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
867        cmhs            v18.16b, v30.16b, v18.16b
868        cmhs            v17.16b, v29.16b, v17.16b
869        cmhs            v16.16b, v28.16b, v16.16b
870        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
871        bsl             v22.16b, v5.16b,  v4.16b
872        bsl             v21.16b, v5.16b,  v4.16b
873        bsl             v20.16b, v5.16b,  v4.16b
874        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
875        bit             v22.16b, v2.16b,  v18.16b
876        bit             v21.16b, v1.16b,  v17.16b
877        bit             v20.16b, v0.16b,  v16.16b
878        subs            w3,  w3,  #16
879        st1             {v23.16b}, [x0],  #16
880        st1             {v22.16b}, [x6],  #16
881        st1             {v21.16b}, [x5],  #16
882        st1             {v20.16b}, [x10], #16
883        b.le            8f
884        ld1             {v5.16b},  [x8], #16
885        b               2b
8868:
887        subs            w4,  w4,  #4
888        b.le            9f
889        // End of horizontal loop, move pointers to next four rows
890        sub             x8,  x8,  w9, uxtw
891        add             x0,  x0,  x1
892        add             x6,  x6,  x1
893        // Load the top row as early as possible
894        ld1             {v5.16b},  [x8], #16
895        add             x5,  x5,  x1
896        add             x10, x10, x1
897        mov             w3,  w9
898        b               1b
8999:
900        ret
901
902L(ipred_paeth_tbl):
903        .hword L(ipred_paeth_tbl) - 640b
904        .hword L(ipred_paeth_tbl) - 320b
905        .hword L(ipred_paeth_tbl) - 160b
906        .hword L(ipred_paeth_tbl) -  80b
907        .hword L(ipred_paeth_tbl) -  40b
908endfunc
909
910// void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
911//                             const pixel *const topleft,
912//                             const int width, const int height, const int a,
913//                             const int max_width, const int max_height);
function ipred_smooth_8bpc_neon, export=1
        // SMOOTH prediction: every output pixel is formed from two linear
        // interpolations in 8.8 fixed point, one horizontal (between left[y]
        // and the right edge pixel) and one vertical (between top[x] and the
        // bottom edge pixel), each blended via the sm_weights[] table:
        //   hor = right*256  + (left[y] - right)  * weights_hor[x]
        //   ver = bottom*256 + (top[x]  - bottom) * weights_ver[y]
        // The two are averaged (uhadd) and rounded back to pixels (rshrn #8).
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw        // x11 = &sm_weights[height]: weights_ver
        add             x10, x10, w3, uxtw        // x10 = &sm_weights[width]:  weights_hor
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4, uxtw        // &topleft[-height] = last left pixel
        sub             w9,  w9,  #25             // jump table index from log2(width)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x12] // bottom
        add             x8,  x2,  #1              // x8 = &topleft[1] = top row
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 writes odd rows, x0 even rows
        lsl             x1,  x1,  #1              // double stride; two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        sub             x2,  x2,  #4              // read left pixels 4 at a time, backwards
        mov             x7,  #-4
        dup             v5.16b,  v6.b[3]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        uhadd           v20.8h,  v20.8h,  v22.8h  // average the two interpolations
        uhadd           v21.8h,  v21.8h,  v23.8h
        rshrn           v20.8b,  v20.8h,  #8      // round back down to pixels
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4              // four rows done per iteration
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        sub             x2,  x2,  #4              // read left pixels 4 at a time, backwards
        mov             x7,  #-4
        dup             v5.16b,  v6.b[7]          // right
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        uhadd           v20.8h,  v20.8h,  v24.8h  // average the two interpolations
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8      // round back down to pixels
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4              // four rows done per iteration
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Wide blocks: iterate 16 columns at a time across two rows.
        add             x12, x2,  w3, uxtw        // &topleft[width]
        sub             x2,  x2,  #2              // read left pixels 2 at a time, backwards
        mov             x7,  #-2
        ld1r            {v5.16b}, [x12]           // right
        sub             x1,  x1,  w3, uxtw        // stride minus width; dst advances #16 per store
        mov             w9,  w3                   // backup of width

1:
        // Per row pair: load two left pixels and two vertical weights.
        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:
        // Inner loop: 16 horizontal pixels per iteration.
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v3.16b}, [x8],   #16     // top
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v0.8h,   v6.8h
        mla             v23.8h,  v0.8h,   v7.8h
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v3.8h,   v16.8h
        mla             v26.8h,  v2.8h,   v17.8h
        mla             v27.8h,  v3.8h,   v17.8h
        uhadd           v20.8h,  v20.8h,  v24.8h  // average the two interpolations
        uhadd           v21.8h,  v21.8h,  v25.8h
        uhadd           v22.8h,  v22.8h,  v26.8h
        uhadd           v23.8h,  v23.8h,  v27.8h
        rshrn           v20.8b,  v20.8h,  #8      // round back down to pixels
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        b.gt            2b
        subs            w4,  w4,  #2              // two rows done
        b.le            9f
        // Rewind top/weights_hor pointers and step dst to the next row pair.
        sub             x8,  x8,  w9, uxtw
        sub             x10, x10, w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc
1090
1091// void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1092//                               const pixel *const topleft,
1093//                               const int width, const int height, const int a,
1094//                               const int max_width, const int max_height);
function ipred_smooth_v_8bpc_neon, export=1
        // SMOOTH_V prediction: vertical-only interpolation in 8.8 fixed point
        // between top[x] and the bottom edge pixel (topleft[-height]):
        //   dst[x] = round((bottom*256 + (top[x]-bottom)*weights_ver[y]) >> 8)
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw        // x7 = &sm_weights[height]: weights_ver
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw        // &topleft[-height] = last left pixel
        sub             w9,  w9,  #25             // jump table index from log2(width)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.16b},  [x8] // bottom
        add             x2,  x2,  #1              // x2 = &topleft[1] = top row
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 writes odd rows, x0 even rows
        lsl             x1,  x1,  #1              // double stride; two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2s}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v22.8h,  v4.8b,   #8      // bottom*256
        shll            v23.8h,  v4.8b,   #8
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v23.8h,  v6.8h,   v18.8h
        rshrn           v22.8b,  v22.8h,  #8      // round back down to pixels
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v22.s}[0], [x0], x1
        st1             {v22.s}[1], [x6], x1
        subs            w4,  w4,  #4              // four rows done per iteration
        st1             {v23.s}[0], [x0], x1
        st1             {v23.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8b}, [x2]             // top
        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        shll            v24.8h,  v4.8b,   #8      // bottom*256
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v25.8h,  v6.8h,   v17.8h
        mla             v26.8h,  v6.8h,   v18.8h
        mla             v27.8h,  v6.8h,   v19.8h
        rshrn           v24.8b,  v24.8h,  #8      // round back down to pixels
        rshrn           v25.8b,  v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn           v27.8b,  v27.8h,  #8
        st1             {v24.8b}, [x0], x1
        st1             {v25.8b}, [x6], x1
        subs            w4,  w4,  #4              // four rows done per iteration
        st1             {v26.8b}, [x0], x1
        st1             {v27.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw        // stride minus width; dst advances #16 per store
        mov             w9,  w3                   // backup of width

1:
        // Per four rows: one vertical weight per row.
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
2:
        // Inner loop: 16 horizontal pixels per iteration.
        ld1             {v3.16b}, [x2],   #16     // top
        shll            v20.8h,  v4.8b,   #8      // bottom*256
        shll            v21.8h,  v4.8b,   #8
        shll            v22.8h,  v4.8b,   #8
        shll            v23.8h,  v4.8b,   #8
        shll            v24.8h,  v4.8b,   #8
        shll            v25.8h,  v4.8b,   #8
        shll            v26.8h,  v4.8b,   #8
        shll            v27.8h,  v4.8b,   #8
        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
        usubl2          v3.8h,   v3.16b,  v4.16b
        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
        mla             v21.8h,  v3.8h,   v16.8h
        mla             v22.8h,  v2.8h,   v17.8h
        mla             v23.8h,  v3.8h,   v17.8h
        mla             v24.8h,  v2.8h,   v18.8h
        mla             v25.8h,  v3.8h,   v18.8h
        mla             v26.8h,  v2.8h,   v19.8h
        mla             v27.8h,  v3.8h,   v19.8h
        rshrn           v20.8b,  v20.8h,  #8      // round back down to pixels
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x8],  #16
        b.gt            2b
        subs            w4,  w4,  #4              // four rows done
        b.le            9f
        // Rewind the top pointer and step dst to the next four rows.
        sub             x2,  x2,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc
1231
1232// void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1233//                               const pixel *const topleft,
1234//                               const int width, const int height, const int a,
1235//                               const int max_width, const int max_height);
function ipred_smooth_h_8bpc_neon, export=1
        // SMOOTH_H prediction: horizontal-only interpolation in 8.8 fixed
        // point between left[y] and the right edge pixel (topleft[width]):
        //   dst[x] = round((right*256 + (left[y]-right)*weights_hor[x]) >> 8)
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw        // x8 = &sm_weights[width]: weights_hor
        clz             w9,  w3
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw        // &topleft[width]
        sub             w9,  w9,  #25             // jump table index from log2(width)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.16b},  [x12] // right
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 writes odd rows, x0 even rows
        lsl             x1,  x1,  #1              // double stride; two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #4              // read left pixels 4 at a time, backwards
        mov             x7,  #-4
        uxtl            v7.8h,   v7.8b            // weights_hor
4:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
        zip1            v0.2s,   v3.2s,   v2.2s
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v1.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8      // round back down to pixels
        rshrn           v21.8b,  v21.8h,  #8
        st1             {v20.s}[0], [x0], x1
        st1             {v20.s}[1], [x6], x1
        subs            w4,  w4,  #4              // four rows done per iteration
        st1             {v21.s}[0], [x0], x1
        st1             {v21.s}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #4              // read left pixels 4 at a time, backwards
        mov             x7,  #-4
        uxtl            v7.8h,   v7.8b            // weights_hor
8:
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v0.8h,   v0.8b,   v5.8b
        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v1.8h,   v7.8h
        mla             v23.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8      // round back down to pixels
        rshrn           v21.8b,  v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn           v23.8b,  v23.8h,  #8
        st1             {v20.8b}, [x0], x1
        st1             {v21.8b}, [x6], x1
        subs            w4,  w4,  #4              // four rows done per iteration
        st1             {v22.8b}, [x0], x1
        st1             {v23.8b}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #4              // read left pixels 4 at a time, backwards
        mov             x7,  #-4
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3, uxtw        // stride minus width; dst advances #16 per store
        mov             w9,  w3                   // backup of width

1:
        // Per four rows: one left pixel per row.
        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
        usubl           v1.8h,   v1.8b,   v5.8b
        usubl           v2.8h,   v2.8b,   v5.8b
        usubl           v3.8h,   v3.8b,   v5.8b
2:
        // Inner loop: 16 horizontal pixels per iteration.
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        shll            v20.8h,  v5.8b,   #8      // right*256
        shll            v21.8h,  v5.8b,   #8
        shll            v22.8h,  v5.8b,   #8
        shll            v23.8h,  v5.8b,   #8
        shll            v24.8h,  v5.8b,   #8
        shll            v25.8h,  v5.8b,   #8
        shll            v26.8h,  v5.8b,   #8
        shll            v27.8h,  v5.8b,   #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
        mla             v22.8h,  v2.8h,   v6.8h
        mla             v23.8h,  v2.8h,   v7.8h
        mla             v24.8h,  v1.8h,   v6.8h
        mla             v25.8h,  v1.8h,   v7.8h
        mla             v26.8h,  v0.8h,   v6.8h
        mla             v27.8h,  v0.8h,   v7.8h
        rshrn           v20.8b,  v20.8h,  #8      // round back down to pixels
        rshrn2          v20.16b, v21.8h,  #8
        rshrn           v22.8b,  v22.8h,  #8
        rshrn2          v22.16b, v23.8h,  #8
        rshrn           v24.8b,  v24.8h,  #8
        rshrn2          v24.16b, v25.8h,  #8
        rshrn           v26.8b,  v26.8h,  #8
        rshrn2          v26.16b, v27.8h,  #8
        subs            w3,  w3,  #16
        st1             {v20.16b}, [x0],  #16
        st1             {v22.16b}, [x6],  #16
        st1             {v24.16b}, [x5],  #16
        st1             {v26.16b}, [x10], #16
        b.gt            2b
        subs            w4,  w4,  #4              // four rows done
        b.le            9f
        // Rewind the weights pointer and step dst to the next four rows.
        sub             x8,  x8,  w9, uxtw
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc
1377
// 32 bytes of 0x00 followed by 32 bytes of 0xff. Loading a vector from
// padding_mask - n yields a byte mask that is 0x00 in the first n lanes and
// 0xff in the rest; the edge functions below use it (via bit) to replace
// out-of-range input lanes with a replicated padding pixel.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
1389
1390// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
1391//                                       const pixel *const in, const int end);
function ipred_z1_upsample_edge_8bpc_neon, export=1
        // Upsample an edge 2x: even outputs copy in[i+1]-aligned source
        // pixels, odd outputs are interpolated with a (-1, 9, 9, -1)/16
        // filter (rounded, clamped to [0, 255] by sqrshrun).
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]           // in[]
        add             x5,  x2,  w3,  uxtw       // in[end]
        sub             x4,  x4,  w3,  uxtw       // padding_mask - end

        ld1r            {v1.16b},  [x5]           // padding
        ld1             {v3.16b},  [x4]           // padding_mask; 0xff for lanes >= end

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]

        // Shifted copies of the padded input: in[i+1], in[i+2], in[i+3].
        ext             v4.16b,  v0.16b,  v1.16b,  #1
        ext             v5.16b,  v0.16b,  v1.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #3

        uaddl           v16.8h,  v4.8b,   v5.8b   // in[i+1] + in[i+2]
        uaddl2          v17.8h,  v4.16b,  v5.16b
        uaddl           v18.8h,  v0.8b,   v6.8b   // in[i+0] + in[i+3]
        uaddl2          v19.8h,  v0.16b,  v6.16b
        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+1] + in[i+2])
        mul             v17.8h,  v17.8h,  v31.8h
        sub             v16.8h,  v16.8h,  v18.8h  // 9*(in[i+1] + in[i+2]) - (in[i+0] + in[i+3])
        sub             v17.8h,  v17.8h,  v19.8h

        sqrshrun        v16.8b,  v16.8h,  #4      // (... + 8) >> 4, clamped to [0, 255]
        sqrshrun2       v16.16b, v17.8h,  #4

        // Interleave source pixels with the filtered, interpolated pixels.
        zip1            v0.16b,  v4.16b,  v16.16b
        zip2            v1.16b,  v4.16b,  v16.16b

        st1             {v0.16b, v1.16b}, [x0]

        ret
endfunc
1428
1429// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
1430//                                       const pixel *const in);
function ipred_z2_upsample_edge_8bpc_neon, export=1
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        // Even outputs copy in[], odd outputs are interpolated with a
        // (-1, 9, 9, -1)/16 filter, as in ipred_z1_upsample_edge above.
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]           // in[]
        add             x5,  x2,  w1,  uxtw       // in[sz]
        sub             x4,  x4,  w1,  uxtw       // padding_mask - sz

        ld1r            {v2.16b},  [x2]           // in[0] for padding
        ld1r            {v1.16b},  [x5]           // padding
        ld1             {v3.16b},  [x4]           // padding_mask; 0xff for lanes >= sz

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]

        // Shifted copies of the padded input: in[i-1], in[i+1], in[i+2].
        ext             v4.16b,  v2.16b,  v0.16b,  #15
        ext             v5.16b,  v0.16b,  v1.16b,  #1
        ext             v6.16b,  v0.16b,  v1.16b,  #2

        uaddl           v16.8h,  v0.8b,   v5.8b   // in[i+0] + in[i+1]
        uaddl           v18.8h,  v4.8b,   v6.8b   // in[i-1] + in[i+2]
        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+0] + in[i+1])
        sub             v16.8h,  v16.8h,  v18.8h  // 9*(in[i+0] + in[i+1]) - (in[i-1] + in[i+2])

        sqrshrun        v16.8b,  v16.8h,  #4      // (... + 8) >> 4, clamped to [0, 255]

        add             x5,  x0,  #16             // &out[16]

        // Interleave source pixels with the filtered, interpolated pixels.
        zip1            v2.16b,  v0.16b,  v16.16b

        // In case sz=8, output one single pixel in out[16].
        st1             {v1.b}[0], [x5]
        st1             {v2.16b}, [x0]

        ret
endfunc
1467
const edge_filter
        // Taps [0..2] of the symmetric 5-tap edge filter kernels for
        // strength=1 and strength=2 (the kernel is mirrored around tap 2),
        // plus one padding byte per row. Only taps 1-2 are loaded by the
        // three-tap code path, since tap 0 is zero for these strengths.
        .byte 0, 4, 8, 0
        .byte 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .byte 2, 4, 4, 0
endconst
1474
1475// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
1476//                                     const pixel *const in, const int end,
1477//                                     const int strength);
function ipred_z1_filter_edge_8bpc_neon, export=1
        // x0 = out, w1 = sz (rounded up to multiples of 16 below),
        // x2 = in, w3 = end (index of last valid input pixel),
        // w4 = strength (1, 2 or 3).
        cmp             w4, #3
        b.eq            L(fivetap)                // if (strength == 3) goto fivetap

        movrel          x5,  edge_filter, -3
        add             x5,  x5,  w4,  uxtw #2    // edge_filter + (strength - 1)*4 + 1

        ld1             {v31.h}[0], [x5]          // kernel[1-2]

        ld1             {v0.16b}, [x2], #16

        dup             v30.16b, v31.b[0]         // kernel[1] (= kernel[3], symmetric)
        dup             v31.16b, v31.b[1]         // kernel[2] (center tap)
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.16b}, [x2], #16
        b.lt            2f
        // Filter one block; all 18 input pixels are valid, no padding needed.
        ext             v2.16b,  v0.16b,  v1.16b,  #1  // in[i+1]
        ext             v3.16b,  v0.16b,  v1.16b,  #2  // in[i+2]
        umull           v4.8h,   v0.8b,   v30.8b  // in[i+0]*kernel[1]
        umlal           v4.8h,   v2.8b,   v31.8b  // + in[i+1]*kernel[2]
        umlal           v4.8h,   v3.8b,   v30.8b  // + in[i+2]*kernel[3]
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16             // sz -= 16
        mov             v0.16b,  v1.16b           // slide the input window
        rshrn           v4.8b,   v4.8h,   #4      // (sum + 8) >> 4
        rshrn2          v4.16b,  v5.8h,   #4
        sub             w3,  w3,  #16             // end -= 16
        st1             {v4.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #32
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b}, [x5]            // padding_mask

        ld1r            {v1.16b}, [x6]            // replicated padding pixel
        bit             v0.16b,  v1.16b,  v2.16b  // Pad v0-v1

        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        umull           v4.8h,   v0.8b,   v30.8b
        umlal           v4.8h,   v2.8b,   v31.8b
        umlal           v4.8h,   v3.8b,   v30.8b
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16
        rshrn           v4.8b,   v4.8h,   #4
        rshrn2          v4.16b,  v5.8h,   #4
        st1             {v4.16b}, [x0], #16
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        // strength == 3: full 5-tap kernel {2, 4, 4, 4, 2}.
        sub             x2,  x2,  #1              // topleft -= 1
        movi            v29.16b, #2               // kernel[0] (= kernel[4])
        ld1             {v0.16b}, [x2], #16
        movi            v30.16b, #4               // kernel[1] (= kernel[3])
        movi            v31.16b, #4               // kernel[2]
        ins             v0.b[0], v0.b[1]          // replace out-of-range in[-1] with in[0]
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.16b}, [x2], #16
        b.lt            2f                        // if (end + 1 < 19)
        ext             v2.16b,  v0.16b,  v1.16b,  #1  // in[i+1]
        ext             v3.16b,  v0.16b,  v1.16b,  #2  // in[i+2]
        ext             v4.16b,  v0.16b,  v1.16b,  #3  // in[i+3]
        ext             v5.16b,  v0.16b,  v1.16b,  #4  // in[i+4]
        umull           v6.8h,   v0.8b,   v29.8b  // in[i+0]*2
        umlal           v6.8h,   v2.8b,   v30.8b  // + in[i+1]*4
        umlal           v6.8h,   v3.8b,   v31.8b  // + in[i+2]*4
        umlal           v6.8h,   v4.8b,   v30.8b  // + in[i+3]*4
        umlal           v6.8h,   v5.8b,   v29.8b  // + in[i+4]*2
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16             // sz -= 16
        mov             v0.16b,  v1.16b           // slide the input window
        rshrn           v6.8b,   v6.8h,   #4      // (sum + 8) >> 4
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16             // end -= 16
        st1             {v6.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask, -1
        sub             w6,  w3,  #31
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b, v3.16b}, [x5]    // padding_mask

        ld1r            {v28.16b}, [x6]           // replicated padding pixel
        bit             v0.16b,  v28.16b, v2.16b  // Pad v0-v1
        bit             v1.16b,  v28.16b, v3.16b
4:
        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v0.16b,  v1.16b,  #3
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        umull           v6.8h,   v0.8b,   v29.8b
        umlal           v6.8h,   v2.8b,   v30.8b
        umlal           v6.8h,   v3.8b,   v31.8b
        umlal           v6.8h,   v4.8b,   v30.8b
        umlal           v6.8h,   v5.8b,   v29.8b
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        mov             v1.16b,  v28.16b          // any further input is all padding
        rshrn           v6.8b,   v6.8h,   #4
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16
        st1             {v6.16b}, [x0], #16
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
1638
1639// void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
1640//                                const int n);
function ipred_pixel_set_8bpc_neon, export=1
        // Fill n (= w2, a multiple of 16) bytes at out (= x0) with the
        // pixel value px (= w1).
        dup             v0.16b,  w1               // broadcast px into all 16 lanes
2:
        st1             {v0.16b}, [x0], #16       // store one 16-byte chunk
        subs            w2,  w2,  #16             // n -= 16 (st1 leaves flags intact)
        b.gt            2b                        // loop while n > 0
        ret
endfunc
1649
1650// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1651//                               const pixel *const top,
1652//                               const int width, const int height,
1653//                               const int dx, const int max_base_x);
function ipred_z1_fill1_8bpc_neon, export=1
        // Register roles: x0 = dst, x1 = stride, x2 = top, w3 = width,
        // w4 = height, w5 = dx, w6 = max_base_x. w7 tracks xpos, which
        // advances by dx per output row; base = xpos >> 6 indexes top[],
        // frac = xpos & 0x3e interpolates between top[base] and top[base+1].
        clz             w9,  w3
        adr             x8,  L(ipred_z1_fill1_tbl)
        sub             w9,  w9,  #25             // table index from clz(width)
        ldrh            w9,  [x8, w9, uxtw #1]
        add             x10, x2,  w6,  uxtw       // top[max_base_x]
        sub             x8,  x8,  w9,  uxtw
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // xpos = dx for the first row
        mov             w15, #64
        br              x8
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f                       // past the edge: fill with padding
        ldr             d0,  [x2, w8, uxtw]       // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        ext             v1.8b,   v0.8b,   v0.8b,   #1 // top[base+1]
        ext             v3.8b,   v2.8b,   v2.8b,   #1
        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,   v3.8b,   v2.8b
        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
        ushll           v17.8h,  v2.8b,   #6
        mla             v16.4h,  v6.4h,   v4.4h   // + top[base+1]*frac
        mla             v17.4h,  v7.4h,   v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2              // h -= 2 (two rows per iteration)
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:     // Fill all remaining rows with the padding pixel
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

80:     // w == 8
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f                       // past the edge: fill with padding
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,   w9               // frac
        dup             v5.8b,   w11
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,   w9               // 64 - frac
        dup             v7.8b,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #1 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #1
        umull           v16.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
        umlal           v16.8h,  v1.8b,   v4.8b   // + top[base+1]*frac
        umull           v17.8h,  v2.8b,   v7.8b
        umlal           v17.8h,  v3.8b,   v5.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2              // h -= 2 (two rows per iteration)
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:     // Fill all remaining rows with the padding pixel
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                   // remember the full width

        add             x13, x0,  x1              // pointer for the second row
        lsl             x1,  x1,  #1              // advance two rows at a time,
        sub             x1,  x1,  w3,  uxtw       // minus what the stores added
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            169f                      // past the edge: fill with padding
        add             x8,  x2,  w8,  uxtw       // &top[base], first row
        add             x10, x2,  w10, uxtw      // &top[base], second row
        dup             v4.16b,  w9               // frac
        dup             v5.16b,  w11
        ld1             {v0.16b, v1.16b}, [x8],  #32 // top[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b,  w9               // 64 - frac
        dup             v7.16b,  w11
        add             w7,  w7,  w5              // xpos += dx
2:
        // Inner loop: produce 16 pixels per row per iteration.
        ext             v16.16b, v0.16b,  v1.16b,  #1 // top[base+1]
        ext             v17.16b, v2.16b,  v3.16b,  #1
        subs            w3,  w3,  #16             // width -= 16
        umull           v18.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
        umlal           v18.8h,  v16.8b,  v4.8b   // + top[base+1]*frac
        umull2          v19.8h,  v0.16b,  v6.16b
        umlal2          v19.8h,  v16.16b, v4.16b
        umull           v20.8h,  v2.8b,   v7.8b
        umlal           v20.8h,  v17.8b,  v5.8b
        umull2          v21.8h,  v2.16b,  v7.16b
        umlal2          v21.8h,  v17.16b, v5.16b
        rshrn           v16.8b,  v18.8h,  #6
        rshrn2          v16.16b, v19.8h,  #6
        rshrn           v17.8b,  v20.8h,  #6
        rshrn2          v17.16b, v21.8h,  #6
        st1             {v16.16b}, [x0],  #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        mov             v0.16b,  v1.16b           // slide the buffered input
        ld1             {v1.16b}, [x8],  #16 // top[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:
        subs            w4,  w4,  #2              // h -= 2
        b.le            9f
        add             x0,  x0,  x1              // advance to the next row pair
        add             x13, x13, x1
        mov             w3,  w12                  // reset the width counter
        b               1b
9:
        ret

169:    // Fill all remaining rows with the padding pixel
        st1             {v31.16b}, [x0],  #16
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b

L(ipred_z1_fill1_tbl):
        .hword L(ipred_z1_fill1_tbl) - 640b
        .hword L(ipred_z1_fill1_tbl) - 320b
        .hword L(ipred_z1_fill1_tbl) - 160b
        .hword L(ipred_z1_fill1_tbl) -  80b
        .hword L(ipred_z1_fill1_tbl) -  40b
endfunc
1826
// void ipred_z1_fill2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const int width, const int height,
//                               const int dx, const int max_base_x);
// Like ipred_z1_fill1 but top[] holds interleaved pixel pairs (presumably
// a 2x upsampled edge): top[base] comes from the even elements and
// top[base+1] from the odd elements, via uzp1/uzp2. Only w == 4 or 8.
function ipred_z1_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // top[max_base_x]
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // xpos = dx for the first row
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f                       // past the edge: fill with padding
        ldr             d0,  [x2, w8, uxtw]       // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8b,   v0.8b,   v0.8b   // top[base+1] (odd elements)
        uzp1            v0.8b,   v0.8b,   v0.8b   // top[base] (even elements)
        uzp2            v3.8b,   v2.8b,   v2.8b
        uzp1            v2.8b,   v2.8b,   v2.8b
        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,   v3.8b,   v2.8b
        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
        ushll           v17.8h,  v2.8b,   #6
        mla             v16.4h,  v6.4h,   v4.4h   // + top[base+1]*frac
        mla             v17.4h,  v7.4h,   v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2              // h -= 2 (two rows per iteration)
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:     // Fill all remaining rows with the padding pixel
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f                       // past the edge: fill with padding
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,   w9               // frac
        dup             v5.8b,   w11
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,   w9               // 64 - frac
        dup             v7.8b,   w11
        uzp2            v1.16b,  v0.16b,  v0.16b  // top[base+1] (odd elements)
        uzp1            v0.16b,  v0.16b,  v0.16b  // top[base] (even elements)
        uzp2            v3.16b,  v2.16b,  v2.16b
        uzp1            v2.16b,  v2.16b,  v2.16b
        umull           v16.8h,  v1.8b,   v4.8b   // top[base+1]*frac
        umlal           v16.8h,  v0.8b,   v6.8b   // + top[base]*(64-frac)
        umull           v17.8h,  v3.8b,   v5.8b
        umlal           v17.8h,  v2.8b,   v7.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2              // h -= 2 (two rows per iteration)
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:     // Fill all remaining rows with the padding pixel
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret
endfunc
1913
1914// void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
1915//                              const int n);
function ipred_reverse_8bpc_neon, export=1
        // Copy n (= w2) pixels from src (= x1) to dst (= x0) in reversed
        // order, processing 16 bytes per iteration, walking backwards
        // through src.
        sub             x1,  x1,  #16             // point at the last 16-byte chunk
        add             x3,  x0,  #8              // second store pointer (dst + 8)
        mov             x4,  #16                  // store stride
1:
        ld1             {v0.16b}, [x1]
        subs            w2,  w2,  #16             // n -= 16
        rev64           v0.16b,  v0.16b           // reverse bytes within each 64-bit half
        sub             x1,  x1,  #16             // step backwards through src
        st1             {v0.d}[1], [x0], x4       // reversed high half -> dst[0..7]
        st1             {v0.d}[0], [x3], x4       // reversed low half -> dst[8..15]
        b.gt            1b
        ret
endfunc
1930
const increments
        // {0, 1, ..., 15} as 16-bit lane indices, used below to compute
        // per-column positions (e.g. multiples of -dy in the z2 functions).
        .short          0,  1,  2,  3,  4,  5,  6,  7
        .short          8,  9,  10, 11, 12, 13, 14, 15
endconst
1935
1936// void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
1937//                               const pixel *const top,
1938//                               const pixel *const left,
1939//                               const int width, const int height,
1940//                               const int dx, const int dy);
1941function ipred_z2_fill1_8bpc_neon, export=1
1942        clz             w10, w4
1943        adr             x9,  L(ipred_z2_fill1_tbl)
1944        sub             w10, w10, #25
1945        ldrh            w10, [x9, w10, uxtw #1]
1946        mov             w8,  #(1 << 6)            // xpos = 1 << 6
1947        sub             x9,  x9,  w10, uxtw
1948        sub             w8,  w8,  w6              // xpos -= dx
1949
1950        movrel          x11, increments
1951        ld1             {v31.8h},  [x11]          // increments
1952        neg             w7,  w7                   // -dy
1953
1954        br              x9
195540:
1956        AARCH64_VALID_JUMP_TARGET
1957
1958        dup             v30.4h,  w7               // -dy
1959        movi            v17.8b,  #1
1960
1961        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
1962        movi            v25.16b, #0x3e
1963        add             v30.4h,  v16.4h,  v30.4h  // -= dy
1964
1965        xtn             v31.8b,  v31.8h           // {0,1,2,3}
1966
1967        // Worst case height for w=4 is 16, but we need at least h+1 elements
1968        ld1             {v0.16b, v1.16b}, [x3]    // left[]
1969
1970        movi            v26.16b, #64
1971        movi            v19.16b, #2
1972
1973        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
1974        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
1975        and             v27.8b,  v27.8b,  v25.8b  // frac_y
1976
1977        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
1978
1979        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
1980        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
1981
1982        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]
1983
1984        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2
1985
1986        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
1987
1988        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
1989
1990        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
1991        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
1992
1993        movi            v29.8b,  #2
19944:
1995        asr             w9,  w8,  #6              // base_x
1996        dup             v6.4h,   w8               // xpos
1997        sub             w8,  w8,  w6              // xpos -= dx
1998        cmp             w9,  #-4                  // base_x <= -4
1999        asr             w11, w8,  #6              // base_x
2000        b.le            49f
2001
2002        dup             v7.4h,   w8               // xpos
2003
2004        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
2005        ldr             d4,  [x2, w11, sxtw]
2006
2007        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
2008
2009        // Cut corners here; only doing tbl over v0 here; we only
2010        // seem to need the last pixel, from v1, after skipping to the
2011        // left-only codepath below.
2012        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
2013
2014        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
2015        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
2016
2017        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
2018        ext             v5.8b,   v4.8b,   v4.8b,   #1
2019
2020        and             v6.8b,   v6.8b,   v25.8b  // frac_x
2021
2022        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2023
2024        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
2025        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]
2026
2027        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
2028
2029        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
2030
2031        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
2032        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2033
2034        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]-*(64-frac_x)
2035        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
2036
2037        cmge            v20.8b,  v20.8b,  #0
2038
2039        rshrn           v16.8b,  v16.8h,  #6
2040        rshrn           v22.8b,  v22.8h,  #6
2041
2042        bit             v16.8b,  v22.8b,  v20.8b
2043
2044        st1             {v16.s}[0], [x0], x1
2045        sub             w8,  w8,  w6              // xpos -= dx
2046        subs            w5,  w5,  #2
2047        st1             {v16.s}[1], [x0], x1
2048        b.le            9f
2049
2050        ext             v16.8b,  v17.8b,  v17.8b, #4
2051        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2052        b               4b
2053
205449:
2055        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]
2056
2057        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
2058
2059        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_t)
2060        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
2061        rshrn           v18.8b,  v18.8h,  #6
2062
2063        st1             {v18.s}[0], [x0], x1
2064        subs            w5,  w5,  #2
2065        st1             {v18.s}[1], [x0], x1
2066        b.le            9f
2067
2068        ext             v16.8b,  v17.8b,  v17.8b, #4
2069        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
2070        b               49b
2071
20729:
2073        ret
2074
207580:
2076        AARCH64_VALID_JUMP_TARGET
2077
2078        dup             v30.8h,  w7               // -dy
2079        movi            v17.8b,  #1
2080
2081        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
2082        movi            v25.16b, #0x3e
2083        add             v30.8h,  v16.8h,  v30.8h  // -= dy
2084
2085        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2086
2087        // Worst case height for w=8 is 32, but we need at least h+1 elements
2088        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]
2089
2090        movi            v26.16b, #64
2091        movi            v19.16b, #2
2092
2093        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
2094        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2095        and             v27.8b,  v27.8b,  v25.8b  // frac_y
2096
2097        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2098
2099        // Cut corners here; for the first row we don't expect to need to
2100        // read outside of v0.
2101        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2102
2103        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
2104        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1
2105
2106        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
2107
2108        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
2109
2110        movi            v24.8b,  #2               // 2
21118:
2112        asr             w9,  w8,  #6              // base_x
2113        dup             v16.8h,   w8              // xpos
2114        sub             w8,  w8,  w6              // xpos -= dx
2115        cmp             w9,  #-8                  // base_x <= -8
2116        asr             w11, w8,  #6              // base_x
2117        b.le            89f
2118
2119        dup             v17.8h,   w8              // xpos
2120
2121        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2122        ldr             q6,  [x2, w11, sxtw]
2123
2124        // Cut corners here; only doing tbl over v0-v1 here; we only
2125        // seem to need the last pixel, from v2, after skipping to the
2126        // left-only codepath below.
2127        tbl             v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]
2128
2129        shrn            v21.8b,  v16.8h,  #6      // first base_x
2130        shrn2           v21.16b, v17.8h,  #6
2131        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2132        xtn2            v16.16b, v17.8h
2133
2134        tbl             v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]
2135
2136        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
2137        ext             v7.16b,  v6.16b,  v6.16b,  #1
2138
2139        and             v16.16b, v16.16b, v25.16b // frac_x
2140
2141        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
2142        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
2143
2144        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
2145
2146        add             v21.16b, v21.16b, v31.16b // actual base_x
2147
2148        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2149        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2150        umull           v17.8h,  v19.8b,  v28.8b
2151        umlal           v17.8h,  v20.8b,  v27.8b
2152
2153        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]-*(64-frac_x)
2154        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2155        umull2          v23.8h,  v4.16b,  v7.16b
2156        umlal2          v23.8h,  v5.16b,  v16.16b
2157
2158        cmge            v21.16b, v21.16b, #0
2159
2160        rshrn           v6.8b,   v6.8h,   #6
2161        rshrn2          v6.16b,  v17.8h,  #6
2162        rshrn           v22.8b,  v22.8h,  #6
2163        rshrn2          v22.16b, v23.8h,  #6
2164
2165        bit             v6.16b,  v22.16b, v21.16b
2166
2167        st1             {v6.d}[0], [x0], x1
2168        sub             w8,  w8,  w6              // xpos -= dx
2169        subs            w5,  w5,  #2
2170        st1             {v6.d}[1], [x0], x1
2171        b.le            9f
2172
2173        mov             v18.8b,  v20.8b
2174        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2175        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2176        b               8b
2177
217889:
2179        tbl             v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
2180        tbl             v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]
2181
2182        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2183        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2184        umull           v17.8h,  v19.8b,  v28.8b
2185        umlal           v17.8h,  v20.8b,  v27.8b
2186
2187        rshrn           v6.8b,   v6.8h,   #6
2188        rshrn2          v6.16b,  v17.8h,  #6
2189
2190        st1             {v6.d}[0], [x0], x1
2191        subs            w5,  w5,  #2
2192        st1             {v6.d}[1], [x0], x1
2193        b.le            9f
2194
2195        mov             v18.8b,  v20.8b
2196        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
2197        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
2198        b               89b
2199
22009:
2201        ret
2202
2203160:
2204        AARCH64_VALID_JUMP_TARGET
2205
2206        stp             d8,  d9,  [sp, #-0x40]!
2207        stp             d10, d11, [sp, #0x10]
2208        stp             d12, d13, [sp, #0x20]
2209        stp             d14, d15, [sp, #0x30]
2210
2211        add             x11, x11, #16             // increments
2212
2213        dup             v18.8h,  w7               // -dy
2214        movi            v17.16b, #1
2215        add             x3,  x3,  #1              // Skip past left[0]
2216
2217        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
2218
2219        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2220        mul             v19.8h,  v14.8h,  v18.8h  // {8,9,10,11,12,13,14,15}* -dy
2221        movi            v25.16b, #0x3e
2222        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2223        add             v18.8h,  v19.8h,  v18.8h
2224
2225        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2226        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
2227
2228        // Worst case height is 64.
2229        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
2230        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
2231
2232        movi            v26.16b, #64
2233        movi            v19.16b, #2
2234
2235        xtn             v27.8b,  v16.8h           // (uint8_t)ypos
2236        xtn2            v27.16b, v18.8h
2237        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2238        shrn2           v29.16b, v18.8h,  #6
2239        mov             v18.16b, v15.16b          // left[0]
2240        and             v27.16b, v27.16b, v25.16b // frac_y
2241
2242        // Cut corners here; for the first row we don't expect to need to
2243        // read outside of v0.
2244        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
2245
2246        add             v30.16b, v29.16b, v19.16b // base_y + 2
2247        add             v29.16b, v29.16b, v17.16b // base_y + 1
2248
2249        sub             v28.16b, v26.16b, v27.16b // 64 - frac_y
2250
2251        movi            v24.16b, #2               // 2
225216:
2253        asr             w9,  w8,  #6              // base_x
2254        dup             v16.8h,   w8              // xpos
2255        sub             w8,  w8,  w6              // xpos -= dx
2256        cmp             w9,  #-16                 // base_x <= -16
2257        asr             w11, w8,  #6              // base_x
2258        b.le            169f
2259
2260        dup             v17.8h,   w8              // xpos
2261
2262        add             x9,  x2,  w9,  sxtw
2263        add             x11, x2,  w11, sxtw
2264
2265        ld1             {v4.16b, v5.16b}, [x9]    // top[base_x]
2266        mov             v19.16b, v15.16b          // left[0]
2267        ld1             {v6.16b, v7.16b}, [x11]
2268
2269        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2270
2271        mov             v20.16b, v15.16b          // left[0]
2272
2273        shrn            v21.8b,  v16.8h,  #6      // first base_x
2274        shrn            v22.8b,  v17.8h,  #6
2275        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2276        xtn             v17.8b,  v17.8h
2277
2278        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2279
2280        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
2281        trn1            v22.2d,  v22.2d,  v22.2d
2282        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
2283        trn1            v17.2d,  v17.2d,  v17.2d
2284
2285        ext             v5.16b,  v4.16b,  v5.16b,  #1 // top[base_x+1]
2286        ext             v7.16b,  v6.16b,  v7.16b,  #1
2287
2288        and             v16.16b, v16.16b, v25.16b // frac_x
2289        and             v17.16b, v17.16b, v25.16b
2290
2291        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2292        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2293
2294        sub             v8.16b,  v26.16b, v16.16b // 64 - frac_x
2295        sub             v9.16b,  v26.16b, v17.16b
2296
2297        umull2          v11.8h,  v18.16b, v28.16b
2298        umlal2          v11.8h,  v19.16b, v27.16b
2299
2300        add             v21.16b, v21.16b, v31.16b // actual base_x
2301        add             v22.16b, v22.16b, v31.16b
2302
2303        umull           v12.8h,  v19.8b,  v28.8b
2304        umlal           v12.8h,  v20.8b,  v27.8b
2305        umull2          v13.8h,  v19.16b, v28.16b
2306        umlal2          v13.8h,  v20.16b, v27.16b
2307
2308        rshrn           v10.8b,  v10.8h,  #6
2309        rshrn2          v10.16b, v11.8h,  #6
2310        rshrn           v11.8b,  v12.8h,  #6
2311        rshrn2          v11.16b, v13.8h,  #6
2312
2313        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]-*(64-frac_x)
2314        umlal           v12.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
2315        umull2          v13.8h,  v4.16b,  v8.16b
2316        umlal2          v13.8h,  v5.16b,  v16.16b
2317        umull           v14.8h,  v6.8b,   v9.8b
2318        umlal           v14.8h,  v7.8b,   v17.8b
2319        umull2          v18.8h,  v6.16b,  v9.16b
2320        umlal2          v18.8h,  v7.16b,  v17.16b
2321
2322        cmge            v21.16b, v21.16b, #0
2323        cmge            v22.16b, v22.16b, #0
2324
2325        rshrn           v12.8b,  v12.8h,  #6
2326        rshrn2          v12.16b, v13.8h,  #6
2327        rshrn           v13.8b,  v14.8h,  #6
2328        rshrn2          v13.16b, v18.8h,  #6
2329
2330        bit             v10.16b, v12.16b, v21.16b
2331        bit             v11.16b, v13.16b, v22.16b
2332
2333        st1             {v10.16b}, [x0], x1
2334        subs            w5,  w5,  #2
2335        sub             w8,  w8,  w6              // xpos -= dx
2336        st1             {v11.16b}, [x0], x1
2337        b.le            9f
2338
2339        mov             v18.16b, v20.16b
2340        add             v29.16b, v29.16b, v24.16b // base_y += 2
2341        add             v30.16b, v30.16b, v24.16b // base_y += 2
2342        b               16b
2343
2344169:
2345        mov             v19.16b, v15.16b
2346        mov             v20.16b, v15.16b
2347        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2348        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2349
2350        umull           v4.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2351        umlal           v4.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2352        umull2          v5.8h,   v18.16b, v28.16b
2353        umlal2          v5.8h,   v19.16b, v27.16b
2354        umull           v6.8h,   v19.8b,  v28.8b
2355        umlal           v6.8h,   v20.8b,  v27.8b
2356        umull2          v7.8h,   v19.16b, v28.16b
2357        umlal2          v7.8h,   v20.16b, v27.16b
2358
2359        rshrn           v4.8b,   v4.8h,   #6
2360        rshrn2          v4.16b,  v5.8h,   #6
2361        rshrn           v5.8b,   v6.8h,   #6
2362        rshrn2          v5.16b,  v7.8h,   #6
2363
2364        st1             {v4.16b}, [x0], x1
2365        subs            w5,  w5,  #2
2366        st1             {v5.16b}, [x0], x1
2367        b.le            9f
2368
2369        mov             v18.16b, v20.16b
2370        add             v29.16b, v29.16b, v24.16b // base_y += 2
2371        add             v30.16b, v30.16b, v24.16b // base_y += 2
2372        b               169b
2373
23749:
2375        ldp             d14, d15, [sp, #0x30]
2376        ldp             d12, d13, [sp, #0x20]
2377        ldp             d10, d11, [sp, #0x10]
2378        ldp             d8,  d9,  [sp], 0x40
2379        ret
2380
2381320:
2382640:
2383        AARCH64_VALID_JUMP_TARGET
2384
2385        stp             d8,  d9,  [sp, #-0x40]!
2386        stp             d10, d11, [sp, #0x10]
2387        stp             d12, d13, [sp, #0x20]
2388        stp             d14, d15, [sp, #0x30]
2389
2390        add             x11, x11, #16             // increments
2391
2392        dup             v25.8h,  w7               // -dy
2393        add             x3,  x3,  #1              // Skip past left[0]
2394
2395        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
2396
2397        add             x13, x0,  x1              // alternating row
2398        lsl             x1,  x1,  #1              // stride *= 2
2399        sub             x1,  x1,  w4,  uxtw       // stride -= width
2400
2401        movi            v11.8h,  #8
2402        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
2403        add             v26.8h,  v26.8h,  v25.8h  // -= dy
2404        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
2405
2406        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
2407        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
2408
2409        // Worst case height is 64.
2410        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
2411        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
2412
2413        mov             w12, w4                   // orig w
2414        neg             w14, w4                   // -w
2415
24161:
2417        mov             v23.16b, v26.16b          // reset ypos
2418
2419        asr             w9,  w8,  #6              // base_x
2420        dup             v16.8h,   w8              // xpos
2421        sub             w8,  w8,  w6              // xpos -= dx
2422        cmp             w9,  w14                  // base_x <= -w
2423        asr             w11, w8,  #6              // base_x
2424        b.le            329f
2425
2426        dup             v17.8h,   w8              // xpos
2427        sub             w8,  w8,  w6              // xpos -= dx
2428
2429        add             x9,  x2,  w9,  sxtw
2430        add             x11, x2,  w11, sxtw
2431
2432        sqshrn          v21.8b,  v16.8h,  #6      // first base_x
2433        sqshrn          v22.8b,  v17.8h,  #6
2434        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
2435        xtn             v17.8b,  v17.8h
2436
2437        ld1             {v4.16b}, [x9], #16       // top[base_x]
2438        ld1             {v6.16b}, [x11], #16
2439
2440        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
2441        trn1            v22.2d,  v22.2d,  v22.2d
2442        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
2443        trn1            v17.2d,  v17.2d,  v17.2d
2444
2445        movi            v10.16b, #0x3e
2446        movi            v11.16b, #64
2447
2448        and             v16.16b, v16.16b, v10.16b // frac_x
2449        and             v17.16b, v17.16b, v10.16b
2450
2451        sub             v8.16b,  v11.16b, v16.16b // 64 - frac_x
2452        sub             v9.16b,  v11.16b, v17.16b
2453
2454        add             v21.16b, v21.16b, v31.16b // actual base_x
2455        add             v22.16b, v22.16b, v31.16b
2456
24572:
2458        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2459        movi            v12.16b, #64
2460        movi            v20.16b, #2
2461        movi            v10.16b, #0x3e
2462
2463        smov            w10,     v22.b[0]
2464
2465        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
2466        xtn2            v27.16b, v13.8h
2467        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2468        shrn2           v29.16b, v13.8h,  #6
2469        cmp             w10, #0                   // base_x (bottom left) >= 0
2470        and             v27.16b, v27.16b, v10.16b // frac_y
2471
2472        mov             v18.16b, v15.16b          // left[0]
2473
2474        b.ge            4f
2475
2476        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy
2477        movi            v13.16b, #1
2478
2479        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2480        add             v29.16b, v29.16b, v13.16b // base_y + 1
2481        mov             v19.16b, v15.16b          // left[0]
2482
2483        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
2484
2485        ld1             {v5.16b}, [x9], #16       // top[base_x]
2486        ld1             {v7.16b}, [x11], #16
2487
2488        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2489        add             v29.16b, v29.16b, v13.16b // base_y + 2
2490
2491        mov             v20.16b, v15.16b          // left[0]
2492        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2493
2494        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2495        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2496        umull2          v11.8h,  v18.16b, v28.16b
2497        umlal2          v11.8h,  v19.16b, v27.16b
2498        umull           v12.8h,  v19.8b,  v28.8b
2499        umlal           v12.8h,  v20.8b,  v27.8b
2500        umull2          v13.8h,  v19.16b, v28.16b
2501        umlal2          v13.8h,  v20.16b, v27.16b
2502
2503        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
2504        ext             v19.16b, v6.16b,  v7.16b,  #1
2505
2506        rshrn           v10.8b,  v10.8h,  #6
2507        rshrn2          v10.16b, v11.8h,  #6
2508        rshrn           v11.8b,  v12.8h,  #6
2509        rshrn2          v11.16b, v13.8h,  #6
2510
2511        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]-*(64-frac_x)
2512        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
2513        umull2          v13.8h,  v4.16b,  v8.16b
2514        umlal2          v13.8h,  v18.16b, v16.16b
2515        umull           v14.8h,  v6.8b,   v9.8b
2516        umlal           v14.8h,  v19.8b,  v17.8b
2517        umull2          v20.8h,  v6.16b,  v9.16b
2518        umlal2          v20.8h,  v19.16b, v17.16b
2519
2520        cmge            v18.16b, v21.16b, #0
2521        cmge            v19.16b, v22.16b, #0
2522
2523        rshrn           v12.8b,  v12.8h,  #6
2524        rshrn2          v12.16b, v13.8h,  #6
2525        rshrn           v13.8b,  v14.8h,  #6
2526        rshrn2          v13.16b, v20.8h,  #6
2527
2528        bit             v10.16b, v12.16b, v18.16b
2529        bit             v11.16b, v13.16b, v19.16b
2530
2531        st1             {v10.16b}, [x0], #16
2532        subs            w4,  w4,  #16
2533        st1             {v11.16b}, [x13], #16
2534        b.le            3f
2535
2536        movi            v10.16b, #16
2537        mov             v4.16b,  v5.16b
2538        mov             v6.16b,  v7.16b
2539        add             v21.16b, v21.16b, v10.16b // base_x += 16
2540        add             v22.16b, v22.16b, v10.16b
2541        b               2b
2542
25433:
2544        subs            w5,  w5,  #2
2545        b.le            9f
2546        movi            v10.8h, #128
2547        add             x0,  x0,  x1
2548        add             x13, x13, x1
2549        mov             w4,  w12                  // reset w
2550        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
2551        b               1b
2552
25534:      // The rest of the row only predicted from top[]
2554        ld1             {v5.16b}, [x9], #16       // top[base_x]
2555        ld1             {v7.16b}, [x11], #16
2556
2557        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
2558        ext             v19.16b, v6.16b,  v7.16b,  #1
2559
2560        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]-*(64-frac_x)
2561        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
2562        umull2          v13.8h,  v4.16b,  v8.16b
2563        umlal2          v13.8h,  v18.16b, v16.16b
2564        umull           v14.8h,  v6.8b,   v9.8b
2565        umlal           v14.8h,  v19.8b,  v17.8b
2566        umull2          v20.8h,  v6.16b,  v9.16b
2567        umlal2          v20.8h,  v19.16b, v17.16b
2568
2569        rshrn           v12.8b,  v12.8h,  #6
2570        rshrn2          v12.16b, v13.8h,  #6
2571        rshrn           v13.8b,  v14.8h,  #6
2572        rshrn2          v13.16b, v20.8h,  #6
2573
2574        st1             {v12.16b}, [x0], #16
2575        subs            w4,  w4,  #16
2576        st1             {v13.16b}, [x13], #16
2577        b.le            3b
2578
2579        mov             v4.16b,  v5.16b
2580        mov             v6.16b,  v7.16b
2581        b               4b
2582
2583329:    // The rest of the block only predicted from left[]
2584        add             x1,  x1,  w4,  uxtw       // restore stride
2585        mov             w12, w5                   // orig remaining h
25861:
2587        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2588        movi            v12.16b, #64
2589        movi            v10.16b, #0x3e
2590
2591        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
2592        xtn2            v27.16b, v13.8h
2593        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2594        shrn2           v29.16b, v13.8h,  #6
2595        and             v27.16b, v27.16b, v10.16b // frac_y
2596
2597        mov             v18.16b, v15.16b          // left[0]
2598        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy
2599        movi            v21.16b, #1
2600
2601        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2602        add             v29.16b, v29.16b, v21.16b // base_y + 1
2603
2604        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
26052:
2606        mov             v19.16b, v15.16b          // left[0]
2607        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2608        add             v29.16b, v29.16b, v21.16b // base_y + 2
2609        mov             v20.16b, v15.16b          // left[0]
2610        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2611        add             v29.16b, v29.16b, v21.16b // next base_y
2612
2613        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
2614        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
2615        umull2          v11.8h,  v18.16b, v28.16b
2616        umlal2          v11.8h,  v19.16b, v27.16b
2617        umull           v12.8h,  v19.8b,  v28.8b
2618        umlal           v12.8h,  v20.8b,  v27.8b
2619        umull2          v13.8h,  v19.16b, v28.16b
2620        umlal2          v13.8h,  v20.16b, v27.16b
2621
2622        rshrn           v10.8b,  v10.8h,  #6
2623        rshrn2          v10.16b, v11.8h,  #6
2624        rshrn           v11.8b,  v12.8h,  #6
2625        rshrn2          v11.16b, v13.8h,  #6
2626
2627        st1             {v10.16b}, [x0], x1
2628        subs            w5,  w5,  #2
2629        st1             {v11.16b}, [x13], x1
2630        b.le            3f
2631        mov             v18.16b, v20.16b
2632        b               2b
2633
26343:
2635        subs            w4,  w4,  #16
2636        b.le            9f
2637
2638        lsr             x1,  x1,  #1
2639        msub            x0,  x1,  x12, x0         // ptr -= h * stride
2640        msub            x13, x1,  x12, x13
2641        lsl             x1,  x1,  #1
2642        add             x0,  x0,  #16
2643        add             x13, x13, #16
2644        mov             w5,  w12                  // reset h
2645        b               1b
2646
26479:
2648        ldp             d14, d15, [sp, #0x30]
2649        ldp             d12, d13, [sp, #0x20]
2650        ldp             d10, d11, [sp, #0x10]
2651        ldp             d8,  d9,  [sp], 0x40
2652        ret
2653
2654L(ipred_z2_fill1_tbl):
2655        .hword L(ipred_z2_fill1_tbl) - 640b
2656        .hword L(ipred_z2_fill1_tbl) - 320b
2657        .hword L(ipred_z2_fill1_tbl) - 160b
2658        .hword L(ipred_z2_fill1_tbl) -  80b
2659        .hword L(ipred_z2_fill1_tbl) -  40b
2660endfunc
2661
// Z2 (diagonal, angle between 90 and 180 degrees) intra prediction fill,
// variant 2: the case where the top edge has been upsampled to half-pel
// resolution (upsample_top). Each predicted pixel is either a 6-bit-frac
// bilinear blend of two top[] samples (when its projected x position is
// inside the top edge) or of two left[] samples (when it falls off the
// left end); the choice is made per pixel with a cmge mask + bit blend.
//
// Register arguments (matching the other ipred_z2 entry points in this
// file — confirm against the C prototype):
//   x0 = dst, x1 = stride, x2 = top[] (upsampled), x3 = left[],
//   w4 = width (4 or 8; upsample_top implies w <= 8, h <= 8),
//   w5 = height, w6 = dx, w7 = dy
// Because top[] is upsampled, one output column corresponds to a step of
// 2 bytes in top[]; hence xpos starts at 2 << 6, the per-column offsets
// are doubled to {0,2,4,6,...}, and loaded top data is deinterleaved with
// uzp1/uzp2 into even bytes (top[base_x]) and odd bytes (top[base_x+1]).
// No v8-v15 are written, so no callee-saved SIMD spill is needed.
function ipred_z2_fill2_8bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(2 << 6)            // xpos = 2 << 6 (one col = 2 upsampled samples)
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments = {0,1,2,...}
        neg             w7,  w7                   // -dy
        b.eq            80f                       // w == 8 -> 8-wide path; else w == 4

40:     // Width 4: two rows per iteration, packed into one 8-byte vector.
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.16b, #0x3e            // mask for 6-bit fractional part
        add             v30.4h,  v16.4h,  v30.4h  // -= dy (ypos for columns 0..3)

        xtn             v31.8b,  v31.8h           // {0,1,2,3}

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left, so a single 16-byte load covers it.
        ld1             {v0.16b}, [x3]            // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y = ypos & 0x3e

        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2

        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]

        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2 (row 0 | row 1 indices)

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}

        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y, replicated for both rows
        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y

        movi            v29.8b,  #2
        add             v31.8b,  v31.8b,  v31.8b  // {0,2,4,6,0,2,4,6}: doubled, top is upsampled
4:
        asr             w9,  w8,  #6              // base_x (row 0)
        dup             v6.4h,   w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-8                  // base_x <= -8: whole pair of rows uses left[] only
        asr             w11, w8,  #6              // base_x (row 1)
        b.le            49f

        dup             v7.4h,   w8               // xpos

        ldr             d2,  [x2, w9, sxtw]       // top[base_x] (row 0, interleaved even/odd)
        ldr             d4,  [x2, w11, sxtw]      // top[base_x] (row 1)

        trn1            v6.2d,   v6.2d,   v7.2d   // xpos for row 0 | row 1

        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]

        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
        xtn             v6.8b,   v6.8h            // (uint8_t)xpos

        uzp2            v3.8b,   v2.8b,   v4.8b   // odd bytes:  top[base_x+1]
        uzp1            v2.8b,   v2.8b,   v4.8b   // even bytes: top[base_x]

        and             v6.8b,   v6.8b,   v25.8b  // frac_x = xpos & 0x3e

        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]

        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x

        add             v20.8b,  v20.8b,  v31.8b  // actual base_x per pixel

        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]   * (64-frac_y)
        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1] * frac_y

        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]    * (64-frac_x)
        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1] * frac_x

        cmge            v20.8b,  v20.8b,  #0      // mask: base_x >= 0 -> take the top[] result

        rshrn           v16.8b,  v16.8h,  #6      // left-derived pixels, rounded
        rshrn           v22.8b,  v22.8h,  #6      // top-derived pixels, rounded

        bit             v16.8b,  v22.8b,  v20.8b  // select top where base_x >= 0, else left

        st1             {v16.s}[0], [x0], x1      // store row 0 (4 px)
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2              // two rows done
        st1             {v16.s}[1], [x0], x1      // store row 1
        b.le            9f

        ext             v16.8b,  v17.8b,  v17.8b, #4 // left[base_y+2] becomes next left[base_y]
        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
        b               4b

49:     // Remaining rows are predicted entirely from left[].
        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]

        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]

        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]   * (64-frac_y)
        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1] * frac_y
        rshrn           v18.8b,  v18.8h,  #6

        st1             {v18.s}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v18.s}[1], [x0], x1
        b.le            9f

        ext             v16.8b,  v17.8b,  v17.8b, #4 // shift row 1 data down to row 0
        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
        b               49b

9:
        ret

80:     // Width 8: two rows per iteration in one 16-byte vector (d[0]/d[1]).
        dup             v30.8h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.16b, #0x3e            // mask for 6-bit fractional part
        add             v30.8h,  v16.8h,  v30.8h  // -= dy (ypos for columns 0..7)

        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left, so a single 16-byte load covers it.
        ld1             {v0.16b}, [x3]    // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y = ypos & 0x3e

        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1

        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]

        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}

        movi            v24.8b,  #2               // 2
        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,...}: doubled, top is upsampled
8:
        asr             w9,  w8,  #6              // base_x (row 0)
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-16                 // base_x <= -16: both rows use left[] only
        asr             w11, w8,  #6              // base_x (row 1)
        b.le            89f

        dup             v17.8h,   w8              // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x] (row 0, interleaved even/odd)
        ldr             q6,  [x2, w11, sxtw]      // top[base_x] (row 1)

        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]

        shrn            v21.8b,  v16.8h,  #6      // first base_x
        shrn2           v21.16b, v17.8h,  #6
        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
        xtn2            v16.16b, v17.8h

        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]

        uzp2            v5.16b,  v4.16b,  v6.16b  // odd bytes:  top[base_x+1]
        uzp1            v4.16b,  v4.16b,  v6.16b  // even bytes: top[base_x]

        and             v16.16b, v16.16b, v25.16b // frac_x = xpos & 0x3e

        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x

        add             v21.16b, v21.16b, v31.16b // actual base_x per pixel

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]   * (64-frac_y)   (row 0)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1] * frac_y
        umull           v17.8h,  v19.8b,  v28.8b  // same blend for row 1
        umlal           v17.8h,  v20.8b,  v27.8b

        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]    * (64-frac_x)
        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1] * frac_x
        umull2          v23.8h,  v4.16b,  v7.16b
        umlal2          v23.8h,  v5.16b,  v16.16b

        cmge            v21.16b, v21.16b, #0      // mask: base_x >= 0 -> take the top[] result

        rshrn           v6.8b,   v6.8h,   #6      // left-derived pixels, rounded
        rshrn2          v6.16b,  v17.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6      // top-derived pixels, rounded
        rshrn2          v22.16b, v23.8h,  #6

        bit             v6.16b,  v22.16b, v21.16b // select top where base_x >= 0, else left

        st1             {v6.d}[0], [x0], x1       // store row 0 (8 px)
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2              // two rows done
        st1             {v6.d}[1], [x0], x1       // store row 1
        b.le            9f

        mov             v18.8b,  v20.8b           // left[base_y+2] becomes next left[base_y]
        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
        b               8b

89:     // Remaining rows are predicted entirely from left[].
        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]
        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]   * (64-frac_y)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1] * frac_y
        umull           v17.8h,  v19.8b,  v28.8b
        umlal           v17.8h,  v20.8b,  v27.8b

        rshrn           v6.8b,   v6.8h,   #6
        rshrn2          v6.16b,  v17.8h,  #6

        st1             {v6.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        mov             v18.8b,  v20.8b           // advance the running left[base_y]
        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
        b               89b

9:
        ret
endfunc
2906
// z2 directional prediction, variant with an upsampled left edge:
// each output pixel is either interpolated from top[] (when its
// base_x >= 0) or from left[]. base_y advances by 4 per two output
// rows, consistent with a 2x upsampled left[] array (see the C caller).
// In: x0 = dst, x1 = stride, x2 = top, x3 = left,
//     w4 = width (4 or 8), w5 = height, w6 = dx, w7 = dy.
function ipred_z2_fill3_8bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(1 << 6)            // xpos = 1 << 6
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments
        neg             w7,  w7                   // -dy
        b.eq            80f

40:     // w == 4
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.16b, #0x3e
        add             v30.4h,  v16.4h,  v30.4h  // -= dy

        xtn             v31.8b,  v31.8h           // {0,1,2,3}

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.16b, v1.16b}, [x3]    // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2

        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}

        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3

        trn1            v29.2s,  v29.2s,  v28.2s  // base_y + 0, base_y + 2
        trn1            v30.2s,  v30.2s,  v24.2s  // base_y + 1, base_y + 3

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y

        movi            v24.8b,  #4
4:
        // Main loop: two 4-pixel rows per iteration, blending the
        // top-edge and left-edge interpolations per pixel.
        asr             w9,  w8,  #6              // base_x
        dup             v6.4h,   w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-4                  // base_x <= -4
        asr             w11, w8,  #6              // base_x
        b.le            49f

        dup             v7.4h,   w8               // xpos

        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
        ldr             d4,  [x2, w11, sxtw]

        trn1            v6.2d,   v6.2d,   v7.2d   // xpos

        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]

        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
        xtn             v6.8b,   v6.8h            // (uint8_t)xpos

        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
        ext             v5.8b,   v4.8b,   v4.8b,   #1

        and             v6.8b,   v6.8b,   v25.8b  // frac_x

        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]

        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x

        add             v20.8b,  v20.8b,  v31.8b  // actual base_x

        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y

        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]*(64-frac_x)
        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x

        cmge            v20.8b,  v20.8b,  #0      // select top where base_x >= 0

        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6

        bit             v16.8b,  v22.8b,  v20.8b  // merge top/left per pixel

        st1             {v16.s}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
        b               4b

49:
        // base_x <= -4: the whole 4x2 output comes from the left edge only.
        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]

        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
        rshrn           v18.8b,  v18.8h,  #6

        st1             {v18.s}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v18.s}[1], [x0], x1
        b.le            9f

        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
        b               49b

9:
        ret

80:     // w == 8
        dup             v30.8h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.16b, #0x3e
        add             v30.8h,  v16.8h,  v30.8h  // -= dy

        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        add             v28.8b,  v29.8b,  v17.8b  // base_y + 1
        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
        add             v24.8b,  v28.8b,  v19.8b  // base_y + 3

        trn1            v29.2d,  v29.2d,  v30.2d  // base_y + 0, base_y + 2
        trn1            v30.2d,  v28.2d,  v24.2d  // base_y + 1, base_y + 3

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        movi            v24.16b, #4

        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
8:
        // Main loop: two 8-pixel rows per iteration.
        asr             w9,  w8,  #6              // base_x
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-8                  // base_x <= -8
        asr             w11, w8,  #6              // base_x
        b.le            89f

        dup             v17.8h,   w8              // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
        ldr             q6,  [x2, w11, sxtw]

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        shrn            v21.8b,  v16.8h,  #6      // first base_x
        shrn2           v21.16b, v17.8h,  #6
        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
        xtn2            v16.16b, v17.8h

        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v6.16b,  #1

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]

        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x

        add             v21.16b, v21.16b, v31.16b // actual base_x

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
        umull2          v17.8h,  v18.16b, v28.16b
        umlal2          v17.8h,  v19.16b, v27.16b

        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]*(64-frac_x)
        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
        umull2          v23.8h,  v4.16b,  v7.16b
        umlal2          v23.8h,  v5.16b,  v16.16b

        cmge            v21.16b, v21.16b, #0      // select top where base_x >= 0

        rshrn           v6.8b,   v6.8h,   #6
        rshrn2          v6.16b,  v17.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6
        rshrn2          v22.16b, v23.8h,  #6

        bit             v6.16b,  v22.16b, v21.16b // merge top/left per pixel

        st1             {v6.d}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4
        add             v30.16b, v30.16b, v24.16b // base_y += 4
        b               8b

89:
        // base_x <= -8: the whole 8x2 output comes from the left edge only.
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
        umull2          v17.8h,  v18.16b, v28.16b
        umlal2          v17.8h,  v19.16b, v27.16b

        rshrn           v6.8b,   v6.8h,   #6
        rshrn2          v6.16b,  v17.8h,  #6

        st1             {v6.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4
        add             v30.16b, v30.16b, v24.16b // base_y += 4
        b               89b

9:
        ret
endfunc
3154
3155
3156// void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3157//                               const pixel *const left,
3158//                               const int width, const int height,
3159//                               const int dy, const int max_base_y);
// Interpolate each output pixel between left[base] and left[base+1],
// stepping ypos by dy per output row; pixels past max_base_y are filled
// with the padding value left[max_base_y]. Dispatches on width via a
// jump table; widths 32/64 and the max_base_y > 64 case use looping
// fallbacks.
function ipred_z3_fill1_8bpc_neon, export=1
        cmp             w6,  #64
        clz             w9,  w3
        adr             x8,  L(ipred_z3_fill1_tbl)
        sub             w9,  w9,  #25
        ldrh            w9,  [x8, w9, uxtw #1]
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        sub             x8,  x8,  w9,  uxtw
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]          // padding
        ld1             {v30.8h},  [x11]          // increments
        mov             w7,  w5                   // ypos = dy (running, used by the fallback)
        b.gt            L(ipred_z3_fill1_large_h16) // max_base_y > 64
        br              x8

40:
        AARCH64_VALID_JUMP_TARGET
        dup             v29.4h,  w5               // dy

        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
        ld1             {v0.16b, v1.16b}, [x2] // left[]
        add             v30.4h,  v29.4h,  v30.4h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        mov             v4.8b,   v31.8b           // preload padding; tbx keeps it for OOR indices
        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac

        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]

        trn1            v27.2s,  v27.2s,  v28.2s  // base + 1, base + 2
        trn1            v24.2s,  v24.2s,  v24.2s  // frac
        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac
1:
        mov             v5.8b,   v31.8b
        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]

        trn1            v4.2s,   v4.2s,   v5.2s   // left[base], left[base+1]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        rshrn           v16.8b,  v16.8h,  #6
        st1             {v16.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        ext             v4.8b,   v5.8b,   v5.8b,  #4 // left[base+2] becomes next left[base]
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
        b               1b

9:
        ret

80:
        AARCH64_VALID_JUMP_TARGET
        dup             v29.8h,  w5               // dy

        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
        ld1             {v0.16b, v1.16b, v2.16b}, [x2] // left[]
        add             v30.8h,  v29.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        mov             v4.8b,   v31.8b           // preload padding; tbx keeps it for OOR indices
        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac

        tbx             v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
1:
        mov             v5.8b,   v31.8b
        mov             v6.8b,   v31.8b
        tbx             v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
        tbx             v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull           v17.8h,  v5.8b,   v25.8b
        umlal           v17.8h,  v6.8b,   v24.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.le            9f

        mov             v4.8b,   v6.8b            // left[base+2] becomes next left[base]
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
        uqadd           v28.8b,  v28.8b,  v21.8b  // base += 2
        b               1b

9:
        ret

160:
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h,  w5               // dy

        shl             v29.8h,  v28.8h,  #3      // 8*dy
        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // This is only executed if we've checked that max_base_y <= 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
        add             v28.8h,  v28.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        add             v29.8h,  v28.8h,  v29.8h  // ypos + 8*dy

        xtn             v24.8b,  v28.8h           // (uint8_t)ypos
        xtn2            v24.16b, v29.8h
        uqshrn          v26.8b,  v28.8h,  #6      // base
        uqshrn2         v26.16b, v29.8h,  #6
        and             v24.16b, v24.16b, v23.16b // frac

        mov             v4.16b,  v31.16b          // preload padding; tbx keeps it for OOR indices
        uqadd           v27.16b, v26.16b, v20.16b // base + 1
        uqadd           v28.16b, v26.16b, v21.16b // base + 2
        sub             v25.16b, v22.16b, v24.16b // 64 - frac

        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
1:
        mov             v5.16b,  v31.16b
        mov             v6.16b,  v31.16b
        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        umull           v18.8h,  v5.8b,   v25.8b
        umlal           v18.8h,  v6.8b,   v24.8b
        umull2          v19.8h,  v5.16b,  v25.16b
        umlal2          v19.8h,  v6.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn2          v16.16b, v17.8h,  #6
        rshrn           v17.8b,  v18.8h,  #6
        rshrn2          v17.16b, v19.8h,  #6
        st1             {v16.16b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.16b}, [x0], x1
        b.le            9f

        mov             v4.16b,  v6.16b           // left[base+2] becomes next left[base]
        uqadd           v27.16b, v27.16b, v21.16b // base += 2
        uqadd           v28.16b, v28.16b, v21.16b // base += 2
        b               1b

9:
        ret
320:
640:
        AARCH64_VALID_JUMP_TARGET
        dup             v28.8h,  w5               // dy
        mov             w12, w3                   // save width for per-row-pair reload

        add             x13, x0,  x1              // second-row pointer

        shl             v29.8h,  v28.8h,  #3      // 8*dy
        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        lsl             x1,  x1,  #1              // 2*stride
        sub             x1,  x1,  w3,  uxtw       // minus width: line advance
        add             v30.8h,  v28.8h,  v30.8h  // ypos

        // This is only executed if we've checked that max_base_y <= 64.
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

1:
        mov             v26.16b,  v30.16b         // reset ypos

2:
        add             v27.8h,  v26.8h,  v29.8h  // ypos + 8*dy
        uqshrn          v16.8b,  v26.8h,  #6      // base
        uqshrn2         v16.16b, v27.8h,  #6
        xtn             v24.8b,  v26.8h           // (uint8_t)ypos
        xtn2            v24.16b, v27.8h
        umov            w14,     v16.b[0]
        and             v24.16b, v24.16b, v23.16b // frac

        uqadd           v17.16b, v16.16b, v20.16b // base + 1
        cmp             w14, w6                   // base >= max_base_y
        uqadd           v18.16b, v16.16b, v21.16b // base + 2
        sub             v25.16b, v22.16b, v24.16b // 64 - frac

        b.ge            4f

        mov             v4.16b,  v31.16b
        mov             v5.16b,  v31.16b
        mov             v6.16b,  v31.16b
        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]

        subs            w3,  w3,  #16
        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        umull           v18.8h,  v5.8b,   v25.8b
        umlal           v18.8h,  v6.8b,   v24.8b
        umull2          v19.8h,  v5.16b,  v25.16b
        umlal2          v19.8h,  v6.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn2          v16.16b, v17.8h,  #6
        rshrn           v17.8b,  v18.8h,  #6
        rshrn2          v17.16b, v19.8h,  #6
        st1             {v16.16b}, [x0],  #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        add             v26.8h,  v27.8h,  v29.8h  // ypos += 16*dy
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        movi            v16.8h,  #128
        add             x0,  x0,  x1
        add             x13, x13, x1
        add             v30.8h,  v30.8h,  v16.8h  // ypos = dy + y*(1<<6)*2
        mov             w3,  w12
        b               1b

4:
        // base >= max_base_y: fill the rest of this row pair with padding.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0],  #16
        st1             {v31.16b}, [x13], #16
        b.gt            4b
        b               3b

9:
        ret

L(ipred_z3_fill1_large_h16):
        // Fallback case for max_base_y > 64; similar to the z1
        // implementation. This does the filtering vertically, filling out
        // a 2x pixel column at a time.
        mov             w15, #64
        add             x13, x0,  x1
        lsl             x1,  x1,  #1

        mov             w12, w4
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon // tail-call: fill remainder with padding
        add             x8,  x2,  w8,  uxtw
        add             x10, x2,  w10, uxtw
        dup             v4.16b,  w9               // frac
        dup             v5.16b,  w11
        ld1             {v0.16b, v1.16b}, [x8],  #32 // left[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b,  w9               // 64 - frac
        dup             v7.16b,  w11
        add             w7,  w7,  w5              // ypos += dy
2:
        ext             v16.16b, v0.16b,  v1.16b,  #1 // left[base+1]
        ext             v17.16b, v2.16b,  v3.16b,  #1
        subs            w4,  w4,  #16
        umull           v18.8h,  v16.8b,  v4.8b   // left[base+1]*frac
        umlal           v18.8h,  v0.8b,   v6.8b   // + left[base]*(64-frac)
        umull2          v19.8h,  v16.16b, v4.16b
        umlal2          v19.8h,  v0.16b,  v6.16b
        umull           v20.8h,  v17.8b,  v5.8b
        umlal           v20.8h,  v2.8b,   v7.8b
        umull2          v21.8h,  v17.16b, v5.16b
        umlal2          v21.8h,  v2.16b,  v7.16b
        rshrn           v16.8b,  v18.8h,  #6
        rshrn2          v16.16b, v19.8h,  #6
        rshrn           v17.8b,  v20.8h,  #6
        rshrn2          v17.16b, v21.8h,  #6
        zip1            v18.16b, v16.16b, v17.16b // interleave the two columns
        zip2            v19.16b, v16.16b, v17.16b
        st1             {v18.h}[0], [x0],  x1
        st1             {v18.h}[1], [x13], x1
        st1             {v18.h}[2], [x0],  x1
        st1             {v18.h}[3], [x13], x1
        st1             {v18.h}[4], [x0],  x1
        st1             {v18.h}[5], [x13], x1
        st1             {v18.h}[6], [x0],  x1
        st1             {v18.h}[7], [x13], x1
        st1             {v19.h}[0], [x0],  x1
        st1             {v19.h}[1], [x13], x1
        st1             {v19.h}[2], [x0],  x1
        st1             {v19.h}[3], [x13], x1
        st1             {v19.h}[4], [x0],  x1
        st1             {v19.h}[5], [x13], x1
        st1             {v19.h}[6], [x0],  x1
        st1             {v19.h}[7], [x13], x1
        b.le            3f
        mov             v0.16b,  v1.16b
        ld1             {v1.16b}, [x8],  #16      // left[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:
        subs            w3,  w3,  #2              // two columns done
        b.le            9f
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1
        add             x0,  x0,  #2
        add             x13, x13, #2
        mov             w4,  w12
        b               1b
9:
        ret

L(ipred_z3_fill1_tbl):
        .hword L(ipred_z3_fill1_tbl) - 640b
        .hword L(ipred_z3_fill1_tbl) - 320b
        .hword L(ipred_z3_fill1_tbl) - 160b
        .hword L(ipred_z3_fill1_tbl) -  80b
        .hword L(ipred_z3_fill1_tbl) -  40b
endfunc
3513
// Fill the remaining w3 x w4 rectangle with the padding value in v31;
// entered (by tail-call from the z3 fill functions) once base >= max_base_y.
// Expected entry state, as set up by the visible caller: x0 = dst,
// x13 = dst + stride, x1 = 2*stride, v31 = splatted padding byte,
// w3 = remaining width, w4 = height.
//
// Fix: the 8-wide and 16/32/64-wide row loops branched back to `4b`,
// falling into the 4-byte-wide store loop for the remaining rows (wrong
// store width and wrong width decrement). They now loop on their own
// labels (`8b` / `64b`), matching the 2- and 4-wide cases.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #16
        adr             x8,  L(ipred_z3_fill_padding_tbl)
        b.gt            L(ipred_z3_fill_padding_wide)
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3
        sub             w9,  w9,  #25
        ldrh            w9,  [x8, w9, uxtw #1]
        sub             x9,  x8,  w9,  uxtw
        br              x9

2:
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.h}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.h}[0], [x13], x1
        st1             {v31.h}[0], [x0],  x1
        st1             {v31.h}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #2
        add             x13, x13, #2
        mov             w4,  w12
        b               1b

4:
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4
        add             x13, x13, #4
        mov             w4,  w12
        b               1b

8:
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.8b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8b}, [x13], x1
        st1             {v31.8b}, [x0],  x1
        st1             {v31.8b}, [x13], x1
        b.gt            8b                        // stay in the 8-byte-wide loop
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

16:
32:
64:
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.16b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.16b}, [x13], x1
        st1             {v31.16b}, [x0],  x1
        st1             {v31.16b}, [x13], x1
        b.gt            64b                       // stay in the 16-byte-wide loop
        subs            w3,  w3,  #16
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret

L(ipred_z3_fill_padding_tbl):
        .hword L(ipred_z3_fill_padding_tbl) - 64b
        .hword L(ipred_z3_fill_padding_tbl) - 32b
        .hword L(ipred_z3_fill_padding_tbl) - 16b
        .hword L(ipred_z3_fill_padding_tbl) -  8b
        .hword L(ipred_z3_fill_padding_tbl) -  4b
        .hword L(ipred_z3_fill_padding_tbl) -  2b

L(ipred_z3_fill_padding_wide):
        // Fill a WxH rectangle with padding, with W > 16.
        lsr             x1,  x1,  #1              // back to a single stride
        mov             w12, w3
        sub             x1,  x1,  w3,  uxtw       // stride - width = line advance
1:
        ands            w5,  w3,  #15
        b.eq            2f
        // If the width isn't aligned to 16, first do one 16 byte write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.16b}, [x0]
        add             x0,  x0,  w5,  uxtw
2:
        // Fill the rest of the line with aligned 16 byte writes.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1
        b.le            9f
        mov             w3,  w12
        b               1b
9:
        ret
endfunc
3646
// void ipred_z3_fill2_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const left,
//                               const int width, const int height,
//                               const int dy, const int max_base_y);
// Zone-3 (bottom-left) diagonal prediction, edge-upsampled case; w is 4 or 8.
// x0=dst, x1=stride, x2=left[], w3=width, w4=height, w5=dy, w6=max_base_y.
function ipred_z3_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        movrel          x11, increments
        ld1r            {v31.16b}, [x10]          // padding
        ld1             {v30.8h},  [x11]          // increments
        b.eq            80f

40:     // w == 4
        dup             v29.4h,  w5               // dy

        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2] // left[]
        add             v30.4h,  v29.4h,  v30.4h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        trn1            v24.2s,  v24.2s,  v24.2s  // frac
        trn1            v26.2s,  v26.2s,  v28.2s  // base + 0, base + 2
        trn1            v27.2s,  v27.2s,  v29.2s  // base + 1, base + 3
        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.8b,   v31.8b
        mov             v5.8b,   v31.8b
        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        rshrn           v16.8b,  v16.8h,  #6
        st1             {v16.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        uqadd           v26.8b,  v26.8b,  v21.8b  // base += 4
        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 4
        b               1b

9:
        ret

80:     // w == 8
        dup             v29.8h,  w5               // dy

        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
        movi            v23.16b, #0x3e

        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
        // so max_base_y <= 32.
        ld1             {v0.16b, v1.16b}, [x2] // left[]
        add             v30.8h,  v29.8h,  v30.8h  // ypos

        movi            v22.16b, #64
        movi            v20.16b, #1
        movi            v21.16b, #2

        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
        uqshrn          v26.8b,  v30.8h,  #6      // base
        and             v24.8b,  v24.8b,  v23.8b  // frac

        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3

        trn1            v24.2d,  v24.2d,  v24.2d  // frac
        trn1            v26.2d,  v26.2d,  v28.2d  // base + 0, base + 2
        trn1            v27.2d,  v27.2d,  v29.2d  // base + 1, base + 3
        trn1            v25.2d,  v25.2d,  v25.2d  // 64 - frac

        movi            v21.16b, #4
1:
        mov             v4.16b,  v31.16b
        mov             v5.16b,  v31.16b
        // Only v0/v1 hold left[] (max_base_y <= 32, a 32-byte table), so use
        // a two-register table, matching the w == 4 path: tbx then leaves any
        // lane with an index >= 32 at the preset padding value in v4/v5.
        // A four-register {v0-v3} table would instead select from the
        // uninitialized v2/v3 for indices 32..63.
        tbx             v4.16b, {v0.16b, v1.16b}, v26.16b // left[base], left[base+2]
        tbx             v5.16b, {v0.16b, v1.16b}, v27.16b // left[base+1], left[base+3]

        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
        umull2          v17.8h,  v4.16b,  v25.16b
        umlal2          v17.8h,  v5.16b,  v24.16b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.le            9f

        uqadd           v26.16b, v26.16b, v21.16b // base += 4
        uqadd           v27.16b, v27.16b, v21.16b // base += 4
        b               1b

9:
        ret
endfunc
3760
3761
3762// void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3763//                             const pixel *const topleft,
3764//                             const int width, const int height, const int filt_idx,
3765//                             const int max_width, const int max_height);
// Recursive filter intra prediction. Each 4x2 output block is a 7-tap
// combination of topleft (p0), four top pixels (p1-p4) and two left pixels
// (p5, p6); outputs of one block feed the top/left inputs of the next.
// x0=dst (even rows), x6=dst+stride (odd rows), x1=2*stride after setup,
// x2 steps backwards over left/topleft, v16-v22 = taps 0-6 widened to 16 bit.
function ipred_filter_8bpc_neon, export=1
        and             w5,  w5,  #511            // filt_idx
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6              // 64 bytes of taps per filter set
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        adr             x5,  L(ipred_filter_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26             // index width 4..32 into the table
        ldrh            w9,  [x5, w9, uxtw #1]
        sxtl            v16.8h,  v16.8b           // widen signed taps to 16 bit
        sxtl            v17.8h,  v17.8b
        sub             x5,  x5,  w9, uxtw
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // x1 = 2*stride (two rows per pass)
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ldur            s0,  [x2, #1]             // top (0-3)
        sub             x2,  x2,  #2
        mov             x7,  #-2                  // step x2 upwards by 2 left pixels/pass
        uxtl            v0.8h,   v0.8b            // top (0-3)
4:
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.8b,   v2.8h,   #4      // round, shift and clip to pixel
        subs            w4,  w4,  #2
        st1             {v2.s}[0], [x0], x1
        uxtl            v0.8h,   v2.8b            // outputs become next rows' top
        st1             {v2.s}[1], [x6], x1
        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #1]             // top (0-7)
        sub             x2,  x2,  #2
        mov             x7,  #-2                  // step x2 upwards by 2 left pixels/pass
        uxtl            v0.8h,   v0.8b            // top (0-7)
8:
        // First (left) 4x2 block from the outer top/left edge...
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        // ...then the second (right) 4x2 block, whose left/topleft inputs come
        // from top[3-7] and the first block's outputs.
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.8b,   v2.8h,   #4
        uxtl            v1.8h,   v2.8b            // first block, in 16 bit
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.8b,   v3.8h,   #4
        subs            w4,  w4,  #2
        st2             {v2.s, v3.s}[0], [x0], x1 // interleave the two 4-wide halves
        zip2            v0.2s,   v2.2s,   v3.2s   // second rows become next top
        st2             {v2.s, v3.s}[1], [x6], x1
        uxtl            v0.8h,   v0.8b
        b.gt            8b
        ret
160:
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #1              // x8 = top row pointer
        sub             x2,  x2,  #2
        mov             x7,  #-2                  // step x2 upwards by 2 left pixels/pass
        sub             x1,  x1,  w3, uxtw        // stride minus width (dst advanced by #16 stores)
        mov             w9,  w3                   // w9 = saved width, w3 = columns left in row
1:
        ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
        uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2)
2:
        // Four dependent 4x2 blocks per iteration, covering 16 columns;
        // each block's left/topleft come from the previous block's output.
        ld1             {v2.16b}, [x8],   #16     // top(0-15)
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        uxtl            v1.8h,   v2.8b            // top(0-7)
        uxtl2           v2.8h,   v2.16b           // top(8-15)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.8b,   v3.8h,   #4
        uxtl            v0.8h,   v3.8b            // first block, in 16 bit
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.8b,   v4.8h,   #4
        uxtl            v0.8h,   v4.8b            // second block, in 16 bit
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.8b,   v5.8h,   #4
        uxtl            v0.8h,   v5.8b            // third block, in 16 bit
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.8b,   v6.8h,   #4

        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16 // row 0, interleaved
        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16 // row 1, interleaved
        b.le            8f
        // Seed left(0-1)/topleft(2) lanes for the next 16-wide chunk from the
        // last top pixel and the fourth block's rightmost outputs.
        ins             v0.h[2], v2.h[7]
        ins             v0.b[0], v6.b[7]
        ins             v0.b[2], v6.b[3]
        b               2b
8:      // Row pair done; rewind to the row's start and go two rows down.
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw        // just-written row becomes the new top
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore column count
        b               1b
9:
        ret

L(ipred_filter_tbl):
        .hword L(ipred_filter_tbl) - 320b
        .hword L(ipred_filter_tbl) - 160b
        .hword L(ipred_filter_tbl) -  80b
        .hword L(ipred_filter_tbl) -  40b
endfunc
3926
3927// void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
3928//                         const pixel *const pal, const uint8_t *idx,
3929//                         const int w, const int h);
// Palette prediction: expand packed 4-bit palette indices (two per byte in
// idx[], low nibble first) into pixels via a tbl lookup in the 8-entry
// palette. v0 = palette, v31 = #7 nibble mask (at most 8 palette colors).
// Two destination row pointers (x0, x2=dst+stride) with x1 = 2*stride.
function pal_pred_8bpc_neon, export=1
        ld1             {v0.8b}, [x2]             // palette (8 entries)
        clz             w9,  w4
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25             // index width 4..64 into the table
        movi            v31.16b, #7               // index mask (palette size <= 8)
        ldrh            w9,  [x6, w9, uxtw #1]
        sub             x6,  x6,  w9, uxtw
        add             x2,  x0,  x1              // x2 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x6
4:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8b}, [x3], #8         // 8 idx bytes = 16 indices = 4 rows of 4
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4      // odd (high-nibble) indices
        and             v2.8b,   v1.8b,   v31.8b  // even (low-nibble) indices
        zip1            v1.16b,  v2.16b,  v3.16b  // restore original index order
        tbl             v1.16b, {v0.16b}, v1.16b  // palette lookup
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b}, [x3], #16       // 32 indices = 4 rows of 8
        subs            w5,  w5,  #4
        ushr            v4.16b,  v1.16b,  #4      // odd indices
        and             v3.16b,  v1.16b,  v31.16b // even indices
        zip1            v1.16b,  v3.16b,  v4.16b  // rows 0-1
        zip2            v2.16b,  v3.16b,  v4.16b  // rows 2-3
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.16b, v2.16b}, [x3], #32 // 64 indices = 4 rows of 16
        subs            w5,  w5,  #4
        ushr            v5.16b,  v1.16b,  #4      // odd indices
        and             v4.16b,  v1.16b,  v31.16b // even indices
        ushr            v7.16b,  v2.16b,  #4
        and             v6.16b,  v2.16b,  v31.16b
        zip1            v1.16b,  v4.16b,  v5.16b  // row 0
        zip2            v2.16b,  v4.16b,  v5.16b  // row 1
        zip1            v3.16b,  v6.16b,  v7.16b  // row 2
        tbl             v1.16b, {v0.16b}, v1.16b
        zip2            v4.16b,  v6.16b,  v7.16b  // row 3
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 4 rows of 32
        subs            w5,  w5,  #4
        ushr            v21.16b, v16.16b, #4      // odd indices
        and             v20.16b, v16.16b, v31.16b // even indices
        ushr            v23.16b, v17.16b, #4
        and             v22.16b, v17.16b, v31.16b
        ushr            v25.16b, v18.16b, #4
        and             v24.16b, v18.16b, v31.16b
        ushr            v27.16b, v19.16b, #4
        and             v26.16b, v19.16b, v31.16b
        zip1            v16.16b, v20.16b, v21.16b // row 0, left half
        zip2            v17.16b, v20.16b, v21.16b // row 0, right half
        zip1            v18.16b, v22.16b, v23.16b // row 1 ...
        zip2            v19.16b, v22.16b, v23.16b
        zip1            v20.16b, v24.16b, v25.16b
        zip2            v21.16b, v24.16b, v25.16b
        tbl             v16.16b, {v0.16b}, v16.16b
        zip1            v22.16b, v26.16b, v27.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        zip2            v23.16b, v26.16b, v27.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 2 rows of 64
        subs            w5,  w5,  #2
        ushr            v21.16b, v16.16b, #4      // odd indices
        and             v20.16b, v16.16b, v31.16b // even indices
        ushr            v23.16b, v17.16b, #4
        and             v22.16b, v17.16b, v31.16b
        ushr            v25.16b, v18.16b, #4
        and             v24.16b, v18.16b, v31.16b
        ushr            v27.16b, v19.16b, #4
        and             v26.16b, v19.16b, v31.16b
        zip1            v16.16b, v20.16b, v21.16b // row 0, 16-pixel chunks
        zip2            v17.16b, v20.16b, v21.16b
        zip1            v18.16b, v22.16b, v23.16b
        zip2            v19.16b, v22.16b, v23.16b
        zip1            v20.16b, v24.16b, v25.16b // row 1, 16-pixel chunks
        zip2            v21.16b, v24.16b, v25.16b
        tbl             v16.16b, {v0.16b}, v16.16b
        zip1            v22.16b, v26.16b, v27.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        zip2            v23.16b, v26.16b, v27.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret

L(pal_pred_tbl):
        .hword L(pal_pred_tbl) - 64b
        .hword L(pal_pred_tbl) - 32b
        .hword L(pal_pred_tbl) - 16b
        .hword L(pal_pred_tbl) -  8b
        .hword L(pal_pred_tbl) -  4b
endfunc
4067
4068// void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4069//                              const pixel *const topleft,
4070//                              const int width, const int height,
4071//                              const int16_t *ac, const int alpha);
// CfL prediction with a fixed DC of 128 (8bpc midpoint).
// Computes dst = iclip_pixel(dc + ((ac*alpha + sign + 32) >> 6)).
// The L(ipred_cfl_splat_*) bodies below are shared entry points: the other
// cfl variants branch here with v0 = dc splat, v1 = alpha splat, x5 = ac,
// x0/x6 = even/odd row pointers and x1 = 2*stride.
function ipred_cfl_128_8bpc_neon, export=1
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26             // index width 4..32 into the table
        ldrh            w9,  [x7, w9, uxtw #1]
        movi            v0.8h,   #128 // dc
        dup             v1.8h,   w6   // alpha
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x7
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x5], #32 // 16 ac values = 4 rows of 4
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        cmlt            v4.8h,   v2.8h,   #0     // sign
        cmlt            v5.8h,   v3.8h,   #0
        add             v2.8h,   v2.8h,   v4.8h  // diff + sign
        add             v3.8h,   v3.8h,   v5.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        st1             {v2.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v3.s}[0],  [x0], x1
        st1             {v3.s}[1],  [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64 // 32 ac = 4 rows of 8
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0     // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        // Also used for w == 32; x5/x7 read two ac rows in parallel,
        // w3/w9 track columns within a row.
        add             x7,  x5,  w3, uxtw #1     // x7 = ac, one row below x5
        sub             x1,  x1,  w3, uxtw        // stride minus width (dst advances by #16)
        mov             w9,  w3                   // save width
1:
        ld1             {v2.8h, v3.8h}, [x5], #32 // row 0
        ld1             {v4.8h, v5.8h}, [x7], #32 // row 1
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0     // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        subs            w3,  w3,  #16
        st1             {v2.8b, v3.8b},  [x0], #16
        st1             {v4.8b, v5.8b},  [x6], #16
        b.gt            1b
        subs            w4,  w4,  #2              // two rows done
        add             x5,  x5,  w9, uxtw #1     // skip the row x7/x5 already consumed
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore column count
        b.gt            1b
        ret

L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc
4191
4192// void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4193//                              const pixel *const topleft,
4194//                              const int width, const int height,
4195//                              const int16_t *ac, const int alpha);
// CfL prediction with DC computed from the top edge only:
// dc = round(sum(top[0..w-1]) / w), then jumps into the shared
// L(ipred_cfl_splat_*) code with v0 = dc splat and v1 = alpha splat.
function ipred_cfl_top_8bpc_neon, export=1
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26             // index width 4..32 into the table
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #1              // skip topleft; x2 = top[]
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x7
4:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]            // 4 top pixels, replicated twice
        uaddlv          h0,      v0.8b            // 2*sum(top)
        urshr           v0.4h,   v0.4h,   #3      // (2*sum + 4) >> 3 = round(sum/4)
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b            // sum(top)
        urshr           v0.4h,   v0.4h,   #3      // round(sum/8)
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b           // sum(top)
        urshr           v0.4h,   v0.4h,   #4      // round(sum/16)
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b           // partial sums of top
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h   // sum(top)
        urshr           v2.4h,   v2.4h,   #5      // round(sum/32)
        dup             v0.8h,   v2.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc
4244
4245// void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4246//                               const pixel *const topleft,
4247//                               const int width, const int height,
4248//                               const int16_t *ac, const int alpha);
// CfL prediction with DC computed from the left edge only:
// dc = round(sum(left[0..h-1]) / h). Double dispatch: x7 selects the
// height-specific sum below, x9 the width-specific splat body (from the
// shared L(ipred_cfl_splat_tbl)) that it tail-jumps to afterwards.
function ipred_cfl_left_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw        // x2 = left[] (h pixels below topleft)
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26             // index by width
        sub             w8,  w8,  #26             // index by height
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6   // alpha
        sub             x9,  x10, w9, uxtw        // x9 = splat body for this width
        sub             x7,  x7,  w8, uxtw        // x7 = sum body for this height
        add             x6,  x0,  x1              // x6 = dst + stride
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]            // 4 left pixels, replicated twice
        uaddlv          h0,      v0.8b            // 2*sum(left)
        urshr           v0.4h,   v0.4h,   #3      // (2*sum + 4) >> 3 = round(sum/4)
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b            // sum(left)
        urshr           v0.4h,   v0.4h,   #3      // round(sum/8)
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b           // sum(left)
        urshr           v0.4h,   v0.4h,   #4      // round(sum/16)
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b           // partial sums of left
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h   // sum(left)
        urshr           v2.4h,   v2.4h,   #5      // round(sum/32)
        dup             v0.8h,   v2.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
4306
4307// void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
4308//                          const pixel *const topleft,
4309//                          const int width, const int height,
4310//                          const int16_t *ac, const int alpha);
4311function ipred_cfl_8bpc_neon, export=1
4312        sub             x2,  x2,  w4, uxtw
4313        add             w8,  w3,  w4             // width + height
4314        dup             v1.8h,   w6              // alpha
4315        clz             w9,  w3
4316        clz             w6,  w4
4317        dup             v16.8h, w8               // width + height
4318        adr             x7,  L(ipred_cfl_tbl)
4319        rbit            w8,  w8                  // rbit(width + height)
4320        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
4321        sub             w6,  w6,  #26
4322        clz             w8,  w8                  // ctz(width + height)
4323        ldrh            w9,  [x7, w9, uxtw #1]
4324        ldrh            w6,  [x7, w6, uxtw #1]
4325        neg             w8,  w8                  // -ctz(width + height)
4326        sub             x9,  x7,  w9, uxtw
4327        sub             x7,  x7,  w6, uxtw
4328        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
4329        dup             v17.8h,  w8              // -ctz(width + height)
4330        add             x6,  x0,  x1
4331        lsl             x1,  x1,  #1
4332        br              x7
4333
4334L(ipred_cfl_h4):
4335        AARCH64_VALID_JUMP_TARGET
4336        ld1             {v0.s}[0],  [x2], #4
4337        ins             v0.s[1], wzr
4338        add             x2,  x2,  #1
4339        uaddlv          h0,      v0.8b
4340        br              x9
4341L(ipred_cfl_w4):
4342        AARCH64_VALID_JUMP_TARGET
4343        ld1             {v2.s}[0],  [x2]
4344        ins             v2.s[1], wzr
4345        add             v0.4h,   v0.4h,   v16.4h
4346        uaddlv          h2,      v2.8b
4347        cmp             w4,  #4
4348        add             v0.4h,   v0.4h,   v2.4h
4349        ushl            v0.4h,   v0.4h,   v17.4h
4350        b.eq            1f
4351        // h = 8/16
4352        mov             w16, #(0x3334/2)
4353        movk            w16, #(0x5556/2), lsl #16
4354        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
4355        lsr             w16, w16, w17
4356        dup             v16.4h,  w16
4357        sqdmulh         v0.4h,   v0.4h,   v16.4h
43581:
4359        dup             v0.8h,   v0.h[0]
4360        b               L(ipred_cfl_splat_w4)
4361
4362L(ipred_cfl_h8):
4363        AARCH64_VALID_JUMP_TARGET
4364        ld1             {v0.8b},  [x2], #8
4365        uaddlv          h0,      v0.8b
4366        add             x2,  x2,  #1
4367        br              x9
4368L(ipred_cfl_w8):
4369        AARCH64_VALID_JUMP_TARGET
4370        ld1             {v2.8b},  [x2]
4371        add             v0.4h,   v0.4h,   v16.4h
4372        uaddlv          h2,      v2.8b
4373        cmp             w4,  #8
4374        add             v0.4h,   v0.4h,   v2.4h
4375        ushl            v0.4h,   v0.4h,   v17.4h
4376        b.eq            1f
4377        // h = 4/16/32
4378        cmp             w4,  #32
4379        mov             w16, #(0x3334/2)
4380        mov             w17, #(0x5556/2)
4381        csel            w16, w16, w17, eq
4382        dup             v16.4h,  w16
4383        sqdmulh         v0.4h,   v0.4h,   v16.4h
43841:
4385        dup             v0.8h,   v0.h[0]
4386        b               L(ipred_cfl_splat_w8)
4387
4388L(ipred_cfl_h16):
4389        AARCH64_VALID_JUMP_TARGET
4390        ld1             {v0.16b}, [x2], #16
4391        uaddlv          h0,      v0.16b
4392        add             x2,  x2,  #1
4393        br              x9
4394L(ipred_cfl_w16):
4395        AARCH64_VALID_JUMP_TARGET
4396        ld1             {v2.16b}, [x2]
4397        add             v0.4h,   v0.4h,   v16.4h
4398        uaddlv          h2,      v2.16b
4399        cmp             w4,  #16
4400        add             v0.4h,   v0.4h,   v2.4h
4401        ushl            v0.4h,   v0.4h,   v17.4h
4402        b.eq            1f
4403        // h = 4/8/32
4404        cmp             w4,  #4
4405        mov             w16, #(0x3334/2)
4406        mov             w17, #(0x5556/2)
4407        csel            w16, w16, w17, eq
4408        dup             v16.4h,  w16
4409        sqdmulh         v0.4h,   v0.4h,   v16.4h
44101:
4411        dup             v0.8h,   v0.h[0]
4412        b               L(ipred_cfl_splat_w16)
4413
4414L(ipred_cfl_h32):
4415        AARCH64_VALID_JUMP_TARGET
4416        ld1             {v2.16b, v3.16b}, [x2], #32
4417        uaddlv          h2,      v2.16b
4418        uaddlv          h3,      v3.16b
4419        add             x2,  x2,  #1
4420        add             v0.4h,   v2.4h,   v3.4h
4421        br              x9
4422L(ipred_cfl_w32):
4423        AARCH64_VALID_JUMP_TARGET
4424        ld1             {v2.16b, v3.16b}, [x2]
4425        add             v0.4h,   v0.4h,   v16.4h
4426        uaddlv          h2,      v2.16b
4427        uaddlv          h3,      v3.16b
4428        cmp             w4,  #32
4429        add             v0.4h,   v0.4h,   v2.4h
4430        add             v0.4h,   v0.4h,   v3.4h
4431        ushl            v0.4h,   v0.4h,   v17.4h
4432        b.eq            1f
4433        // h = 8/16
4434        mov             w16, #(0x5556/2)
4435        movk            w16, #(0x3334/2), lsl #16
4436        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
4437        lsr             w16, w16, w17
4438        dup             v16.4h,  w16
4439        sqdmulh         v0.4h,   v0.4h,   v16.4h
44401:
4441        dup             v0.8h,   v0.h[0]
4442        b               L(ipred_cfl_splat_w16)
4443
4444L(ipred_cfl_tbl):
4445        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
4446        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
4447        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
4448        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
4449        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
4450        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
4451        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
4452        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
4453endfunc
4454
// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_420_8bpc_neon, export=1
        // Fill the chroma-from-luma (CfL) AC buffer from 4:2:0 luma and
        // subtract its mean. Each output sample is the 2x2 box sum of the
        // luma samples, scaled by << 1 so that all cfl_ac variants share a
        // common fixed-point scale; after the copy (with right/bottom
        // padding by sample replication), the rounded average of the whole
        // block is subtracted from every entry.
        //
        // x0 = ac, x1 = ypx, x2 = luma stride, w3 = w_pad, w4 = h_pad,
        // w5 = block width (4/8/16), w6 = block height.
        clz             w8,  w5
        lsl             w4,  w4,  #2                 // h_pad is passed in units of 4 rows
        adr             x7,  L(ipred_cfl_ac_420_tbl)
        sub             w8,  w8,  #27                // table index: w16=0, w8=1, w4=2
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v16.8h,  #0                  // v16-v19: running sums of all output rows
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
        sub             x7,  x7,  w8, uxtw
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // x10 points at the odd luma rows
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // step over two luma rows at a time
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_420_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8b},   [x1],  x2         // v0 = luma rows 0 and 2
        ld1             {v1.8b},   [x10], x2         // v1 = luma rows 1 and 3
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b              // horizontal pair sums
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h      // 2x2 box sums: two output rows
        shl             v0.8h,   v0.8h,   #1         // common cfl_ac fixed-point scale
        subs            w8,  w8,  #2
        st1             {v0.8h}, [x0], #16
        add             v16.8h,  v16.8h,  v0.8h
        b.gt            1b
        // Duplicate the last output row (v0.d[1]) into both halves of
        // v0/v1 for the vertical padding loop below.
        trn2            v1.2d,   v0.2d,   v0.2d
        trn2            v0.2d,   v0.2d,   v0.2d
L(ipred_cfl_ac_420_w4_hpad):
        cbz             w4,  3f                      // skip if no vertical padding
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        b.gt            2b
3:
        // Aggregate the sums
        add             v0.8h,   v16.8h,  v17.8h
        uaddlv          s0,  v0.8h                // sum
        sub             x0,  x0,  w6, uxtw #3     // rewind ac: height rows * 8 bytes
        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]          // v4 = rounded block mean (dc)
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h}, [x0]
        subs            w6,  w6,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1],  x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        add             v0.8h,   v0.8h,   v1.8h      // 2x2 sums, output row 0
        add             v2.8h,   v2.8h,   v3.8h      // 2x2 sums, output row 1
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v2.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        b.gt            1b
        mov             v0.16b,  v1.16b              // last row, for vertical padding
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_420_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b},   [x1],  x2
        ld1             {v1.8b},   [x10], x2
        ld1             {v0.d}[1], [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        add             v0.8h,   v0.8h,   v1.8h
        shl             v0.8h,   v0.8h,   #1
        dup             v1.4h,   v0.h[3]             // right pad row 0 with its last sample
        dup             v3.4h,   v0.h[7]             // right pad row 1 with its last sample
        trn2            v2.2d,   v0.2d,   v0.2d
        subs            w8,  w8,  #2
        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
        add             v16.4h,  v16.4h,  v0.4h
        add             v17.4h,  v17.4h,  v1.4h
        add             v18.4h,  v18.4h,  v2.4h
        add             v19.4h,  v19.4h,  v3.4h
        b.gt            1b
        // Last padded row is v2|v3; duplicate it for vertical padding.
        trn1            v0.2d,   v2.2d,   v3.2d
        trn1            v1.2d,   v2.2d,   v3.2d

L(ipred_cfl_ac_420_w8_hpad):
        cbz             w4,  3f                      // skip if no vertical padding
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        add             v18.8h,  v18.8h,  v0.8h
        add             v19.8h,  v19.8h,  v1.8h
        b.gt            2b
3:

L(ipred_cfl_ac_420_w8_calc_subtract_dc):
        // Aggregate the sums
        add             v0.8h,   v16.8h,  v17.8h
        add             v2.8h,   v18.8h,  v19.8h
        uaddlp          v0.4s,   v0.8h               // widen to 32 bit before reducing
        uaddlp          v2.4s,   v2.8h
        add             v0.4s,   v0.4s,   v2.4s
        addv            s0,  v0.4s                // sum
        sub             x0,  x0,  w6, uxtw #4     // rewind ac: height rows * 16 bytes
        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]          // v4 = rounded block mean (dc)
L(ipred_cfl_ac_420_w8_subtract_dc):
6:      // Subtract dc from ac
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
        subs            w6,  w6,  #4
        sub             v0.8h,   v0.8h,   v4.8h
        sub             v1.8h,   v1.8h,   v4.8h
        sub             v2.8h,   v2.8h,   v4.8h
        sub             v3.8h,   v3.8h,   v4.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        b.gt            6b
        ret

L(ipred_cfl_ac_420_w16):
        AARCH64_VALID_JUMP_TARGET
        // Dispatch on w_pad (0-3) to a specialized copy loop.
        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_420_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1],  x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v4.16b, v5.16b}, [x1],  x2
        uaddlp          v1.8h,   v1.16b
        ld1             {v6.16b, v7.16b}, [x10], x2
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v5.8h,   v5.16b
        uaddlp          v6.8h,   v6.16b
        uaddlp          v7.8h,   v7.16b
        add             v0.8h,   v0.8h,   v2.8h      // 2x2 sums, output row 0
        add             v1.8h,   v1.8h,   v3.8h
        add             v4.8h,   v4.8h,   v6.8h      // 2x2 sums, output row 1
        add             v5.8h,   v5.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v1.8h,   v1.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        shl             v3.8h,   v5.8h,   #1
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row (v2|v3), for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             d1,  [x1,  #16]              // trailing 8 of the 24 luma px used
        ld1             {v0.16b}, [x1],  x2
        ldr             d3,  [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,   v1.8b
        ldr             d5,  [x1,  #16]
        uaddlp          v0.8h,   v0.16b
        ld1             {v4.16b}, [x1],  x2
        uaddlp          v3.4h,   v3.8b
        ldr             d7,  [x10, #16]
        uaddlp          v2.8h,   v2.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v5.4h,   v5.8b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v7.4h,   v7.8b
        uaddlp          v6.8h,   v6.16b
        add             v1.4h,   v1.4h,   v3.4h
        add             v0.8h,   v0.8h,   v2.8h
        add             v5.4h,   v5.4h,   v7.4h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v1.4h,   v1.4h,   #1
        shl             v0.8h,   v0.8h,   #1
        shl             v3.4h,   v5.4h,   #1
        shl             v2.8h,   v4.8h,   #1
        dup             v4.4h,   v1.h[3]             // pad the last 4 outputs of each row
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row, for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v6.16b}, [x10], x2
        uaddlp          v2.8h,   v2.16b
        uaddlp          v4.8h,   v4.16b
        uaddlp          v6.8h,   v6.16b
        add             v0.8h,   v0.8h,   v2.8h
        add             v4.8h,   v4.8h,   v6.8h
        shl             v0.8h,   v0.8h,   #1
        shl             v2.8h,   v4.8h,   #1
        dup             v1.8h,   v0.h[7]             // right half = last computed sample
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row, for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_420_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1],  x2
        ld1             {v2.8b}, [x10], x2
        ld1             {v4.8b}, [x1],  x2
        uaddlp          v0.4h,   v0.8b
        ld1             {v6.8b}, [x10], x2
        uaddlp          v2.4h,   v2.8b
        uaddlp          v4.4h,   v4.8b
        uaddlp          v6.4h,   v6.8b
        add             v0.4h,   v0.4h,   v2.4h
        add             v4.4h,   v4.4h,   v6.4h
        shl             v0.4h,   v0.4h,   #1
        shl             v2.4h,   v4.4h,   #1
        dup             v1.8h,   v0.h[3]             // pad the rightmost 12 outputs
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row, for vertical padding
        mov             v1.16b,  v3.16b

L(ipred_cfl_ac_420_w16_hpad):
        cbz             w4,  3f                      // skip if no vertical padding
2:      // Vertical padding (h_pad > 0)
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            2b
3:

        // Double the height and reuse the w8 summing/subtracting
        lsl             w6,  w6,  #1
        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)

L(ipred_cfl_ac_420_tbl):
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
        .hword 0

L(ipred_cfl_ac_420_w16_tbl):
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
endfunc
4778
// void ipred_cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_422_8bpc_neon, export=1
        // Fill the chroma-from-luma (CfL) AC buffer from 4:2:2 luma and
        // subtract its mean. Only horizontal 2:1 subsampling here: each
        // output is a horizontal pair sum << 2 (same fixed-point scale as
        // the 4:2:0 variant's 2x2 sum << 1). The vertical-padding and
        // dc-subtraction tails of the 420 function are reused via branches,
        // so the register contract at those labels (v0/v1 = rows to
        // replicate, v16-v19 = sums, v31 = -log2sz, w4/w6/x0) must match.
        //
        // x0 = ac, x1 = ypx, x2 = luma stride, w3 = w_pad, w4 = h_pad,
        // w5 = block width (4/8/16), w6 = block height.
        clz             w8,  w5
        lsl             w4,  w4,  #2                 // h_pad is passed in units of 4 rows
        adr             x7,  L(ipred_cfl_ac_422_tbl)
        sub             w8,  w8,  #27                // table index: w16=0, w8=1, w4=2
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v16.8h,  #0                  // v16-v19: running sums of all output rows
        movi            v17.8h,  #0
        movi            v18.8h,  #0
        movi            v19.8h,  #0
        sub             x7,  x7,  w8, uxtw
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // x10 points at the odd luma rows
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // step over two luma rows at a time
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        ld1             {v0.8b},   [x1],  x2         // four luma rows -> four output rows
        ld1             {v0.d}[1], [x10], x2
        ld1             {v1.8b},   [x1],  x2
        ld1             {v1.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b              // horizontal pair sums
        uaddlp          v1.8h,   v1.16b
        shl             v0.8h,   v0.8h,   #2         // common cfl_ac fixed-point scale
        shl             v1.8h,   v1.8h,   #2
        subs            w8,  w8,  #4
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        st1             {v0.8h, v1.8h}, [x0], #32
        b.gt            1b
        // Duplicate the last output row (v1.d[1]) into v0/v1 for the
        // shared 420 vertical padding code.
        trn2            v0.2d,   v1.2d,   v1.2d
        trn2            v1.2d,   v1.2d,   v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        ld1             {v0.16b}, [x1],  x2
        ld1             {v1.16b}, [x10], x2
        ld1             {v2.16b}, [x1],  x2
        uaddlp          v0.8h,   v0.16b
        ld1             {v3.16b}, [x10], x2
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v1.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        shl             v3.8h,   v3.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b              // last row, for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        ld1             {v0.8b},   [x1],  x2
        ld1             {v0.d}[1], [x10], x2
        ld1             {v2.8b},   [x1],  x2
        ld1             {v2.d}[1], [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v2.8h,   v2.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v0.h[3]             // per-row right-padding samples
        dup             v5.8h,   v0.h[7]
        dup             v6.4h,   v2.h[3]
        dup             v7.8h,   v2.h[7]
        trn2            v1.2d,   v0.2d,   v5.2d      // assemble padded rows 0-3 in v0-v3
        trn1            v0.2d,   v0.2d,   v4.2d
        trn2            v3.2d,   v2.2d,   v7.2d
        trn1            v2.2d,   v2.2d,   v6.2d
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v3.16b              // last padded row, for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        // Dispatch on w_pad (0-3) to a specialized copy loop.
        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        ld1             {v0.16b, v1.16b}, [x1],  x2
        ld1             {v2.16b, v3.16b}, [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v1.8h,   v1.16b
        uaddlp          v2.8h,   v2.16b
        uaddlp          v3.8h,   v3.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v1.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        shl             v3.8h,   v3.8h,   #2
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row (v2|v3), for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        ldr             d1,  [x1,  #16]              // trailing 8 of the 24 luma px used
        ld1             {v0.16b}, [x1],  x2
        ldr             d3,  [x10, #16]
        ld1             {v2.16b}, [x10], x2
        uaddlp          v1.4h,   v1.8b
        uaddlp          v0.8h,   v0.16b
        uaddlp          v3.4h,   v3.8b
        uaddlp          v2.8h,   v2.16b
        shl             v1.4h,   v1.4h,   #2
        shl             v0.8h,   v0.8h,   #2
        shl             v3.4h,   v3.4h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v1.h[3]             // pad the last 4 outputs of each row
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row, for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        uaddlp          v0.8h,   v0.16b
        uaddlp          v2.8h,   v2.16b
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v1.8h,   v0.h[7]             // right half = last computed sample
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row, for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        ld1             {v0.8b}, [x1],  x2
        ld1             {v2.8b}, [x10], x2
        uaddlp          v0.4h,   v0.8b
        uaddlp          v2.4h,   v2.8b
        shl             v0.4h,   v0.4h,   #2
        shl             v2.4h,   v2.4h,   #2
        dup             v1.8h,   v0.h[3]             // pad the rightmost 12 outputs
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        mov             v0.16b,  v2.16b              // last row, for vertical padding
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_tbl):
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
        .hword 0

L(ipred_cfl_ac_422_w16_tbl):
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc
5000
// void ipred_cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
5004function ipred_cfl_ac_444_8bpc_neon, export=1
5005        clz             w8,  w5
5006        lsl             w4,  w4,  #2
5007        adr             x7,  L(ipred_cfl_ac_444_tbl)
5008        sub             w8,  w8,  #26
5009        ldrh            w8,  [x7, w8, uxtw #1]
5010        movi            v16.8h,  #0
5011        movi            v17.8h,  #0
5012        movi            v18.8h,  #0
5013        movi            v19.8h,  #0
5014        sub             x7,  x7,  w8, uxtw
5015        sub             w8,  w6,  w4         // height - h_pad
5016        rbit            w9,  w5              // rbit(width)
5017        rbit            w10, w6              // rbit(height)
5018        clz             w9,  w9              // ctz(width)
5019        clz             w10, w10             // ctz(height)
5020        add             w9,  w9,  w10        // log2sz
5021        add             x10, x1,  x2
5022        dup             v31.4s,  w9
5023        lsl             x2,  x2,  #1
5024        neg             v31.4s,  v31.4s      // -log2sz
5025        br              x7
5026
5027L(ipred_cfl_ac_444_w4):
5028        AARCH64_VALID_JUMP_TARGET
50291:      // Copy and expand input
5030        ld1             {v0.s}[0], [x1],  x2
5031        ld1             {v0.s}[1], [x10], x2
5032        ld1             {v1.s}[0], [x1],  x2
5033        ld1             {v1.s}[1], [x10], x2
5034        ushll           v0.8h,   v0.8b,   #3
5035        ushll           v1.8h,   v1.8b,   #3
5036        subs            w8,  w8,  #4
5037        add             v16.8h,  v16.8h,  v0.8h
5038        add             v17.8h,  v17.8h,  v1.8h
5039        st1             {v0.8h, v1.8h}, [x0], #32
5040        b.gt            1b
5041        trn2            v0.2d,   v1.2d,   v1.2d
5042        trn2            v1.2d,   v1.2d,   v1.2d
5043        b               L(ipred_cfl_ac_420_w4_hpad)
5044
// Width 8: one full row per vector; four rows per iteration.
L(ipred_cfl_ac_444_w8):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input
        ld1             {v0.8b}, [x1],  x2
        ld1             {v1.8b}, [x10], x2
        ld1             {v2.8b}, [x1],  x2
        ushll           v0.8h,   v0.8b,   #3    // u8 -> s16, scaled by 8
        ld1             {v3.8b}, [x10], x2
        ushll           v1.8h,   v1.8b,   #3
        ushll           v2.8h,   v2.8b,   #3
        ushll           v3.8h,   v3.8b,   #3
        subs            w8,  w8,  #4            // four rows consumed per pass
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h // accumulate row sums for DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        b.gt            1b
        // Duplicate the last row (v3) into the two registers the shared
        // hpad loop replicates for vertical padding.
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)
5066
// Width 16: a row spans two 8h vectors; four rows per iteration.
// w3 = wpad; nonzero means only the left 8 pixels are valid.
L(ipred_cfl_ac_444_w16):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
1:      // Copy and expand input, without padding
        ld1             {v0.16b}, [x1],  x2
        ld1             {v2.16b}, [x10], x2
        ld1             {v4.16b}, [x1],  x2
        ushll2          v1.8h,   v0.16b,  #3    // high 8 pixels, u8 -> s16 << 3
        ushll           v0.8h,   v0.8b,   #3    // low 8 pixels
        ld1             {v6.16b}, [x10], x2
        ushll2          v3.8h,   v2.16b,  #3
        ushll           v2.8h,   v2.8b,   #3
        ushll2          v5.8h,   v4.16b,  #3
        ushll           v4.8h,   v4.8b,   #3
        ushll2          v7.8h,   v6.16b,  #3
        ushll           v6.8h,   v6.8b,   #3
        subs            w8,  w8,  #4            // four rows per pass
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h // accumulate row sums for DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        b.gt            1b
        // Keep the last row (v6/v7) duplicated in v0..v3 for the shared
        // hpad loop, which stores two 16-wide rows per iteration.
        mov             v0.16b,  v6.16b
        mov             v1.16b,  v7.16b
        mov             v2.16b,  v6.16b
        mov             v3.16b,  v7.16b
        b               L(ipred_cfl_ac_420_w16_hpad)
5100
// Width 16 with horizontal padding: only 8 valid pixels per row; the right
// half is filled with the last valid (scaled) pixel. Reached only by the
// direct branch above, hence no AARCH64_VALID_JUMP_TARGET marker.
L(ipred_cfl_ac_444_w16_wpad):
1:      // Copy and expand input, padding 8
        ld1             {v0.8b}, [x1],  x2
        ld1             {v2.8b}, [x10], x2
        ld1             {v4.8b}, [x1],  x2
        ld1             {v6.8b}, [x10], x2
        ushll           v0.8h,   v0.8b,   #3    // u8 -> s16 << 3
        ushll           v2.8h,   v2.8b,   #3
        ushll           v4.8h,   v4.8b,   #3
        ushll           v6.8h,   v6.8b,   #3
        dup             v1.8h,   v0.h[7]        // right half = last valid pixel
        dup             v3.8h,   v2.h[7]
        dup             v5.8h,   v4.h[7]
        dup             v7.8h,   v6.h[7]
        subs            w8,  w8,  #4            // four rows per pass
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h // padded pixels count toward DC too
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        b.gt            1b
        // Duplicate the last (already padded) row for the shared hpad loop.
        mov             v0.16b,  v6.16b
        mov             v1.16b,  v7.16b
        mov             v2.16b,  v6.16b
        mov             v3.16b,  v7.16b
        b               L(ipred_cfl_ac_420_w16_hpad)
5132
// Width 32: second-level dispatch on wpad (w3) via another hword offset table.
L(ipred_cfl_ac_444_w32):
        AARCH64_VALID_JUMP_TARGET
        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
        sub             x7,  x7,  w3, uxtw  // offsets are stored back from the table base
        br              x7
5139
// Width 32, no horizontal padding: a row spans four 8h vectors; two rows per
// iteration (one from each interleaved pointer).
L(ipred_cfl_ac_444_w32_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, without padding
        ld1             {v2.16b, v3.16b}, [x1],  x2
        ld1             {v6.16b, v7.16b}, [x10], x2
        ushll           v0.8h,   v2.8b,   #3    // u8 -> s16 << 3, low to high
        ushll2          v1.8h,   v2.16b,  #3
        ushll           v2.8h,   v3.8b,   #3
        ushll2          v3.8h,   v3.16b,  #3
        ushll           v4.8h,   v6.8b,   #3
        ushll2          v5.8h,   v6.16b,  #3
        ushll           v6.8h,   v7.8b,   #3
        ushll2          v7.8h,   v7.16b,  #3
        subs            w8,  w8,  #2            // two rows per pass
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h // accumulate row sums for DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad) // last row left in v4..v7
5166
// Width 32, 24 valid pixels: load 16+8, pad the last 8 with the final pixel.
L(ipred_cfl_ac_444_w32_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 8
        ldr             d2,  [x1,  #16]         // pixels 16..23 (non-updating load)
        ld1             {v1.16b}, [x1],  x2     // pixels 0..15, then advance the row
        ldr             d6,  [x10, #16]
        ld1             {v5.16b}, [x10], x2
        ushll           v2.8h,   v2.8b,   #3    // u8 -> s16 << 3
        ushll           v0.8h,   v1.8b,   #3
        ushll2          v1.8h,   v1.16b,  #3
        ushll           v6.8h,   v6.8b,   #3
        ushll           v4.8h,   v5.8b,   #3
        ushll2          v5.8h,   v5.16b,  #3
        dup             v3.8h,   v2.h[7]        // pad pixels 24..31 with last valid
        dup             v7.8h,   v6.h[7]
        subs            w8,  w8,  #2            // two rows per pass
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h // accumulate row sums for DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad) // last row left in v4..v7
5195
// Width 32, 16 valid pixels: load 16, pad the right 16 with the final pixel.
L(ipred_cfl_ac_444_w32_wpad4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 16
        ld1             {v1.16b}, [x1],  x2
        ld1             {v5.16b}, [x10], x2
        ushll           v0.8h,   v1.8b,   #3    // u8 -> s16 << 3
        ushll2          v1.8h,   v1.16b,  #3
        ushll           v4.8h,   v5.8b,   #3
        ushll2          v5.8h,   v5.16b,  #3
        dup             v2.8h,   v1.h[7]        // pad pixels 16..31 with last valid
        dup             v3.8h,   v1.h[7]
        dup             v6.8h,   v5.h[7]
        dup             v7.8h,   v5.h[7]
        subs            w8,  w8,  #2            // two rows per pass
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h // accumulate row sums for DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad) // last row left in v4..v7
5222
// Width 32, 8 valid pixels: load 8, pad the right 24 with the final pixel.
// Falls through into the shared hpad handler below.
L(ipred_cfl_ac_444_w32_wpad6):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 24
        ld1             {v0.8b}, [x1],  x2
        ld1             {v4.8b}, [x10], x2
        ushll           v0.8h,   v0.8b,   #3    // u8 -> s16 << 3
        ushll           v4.8h,   v4.8b,   #3
        dup             v1.8h,   v0.h[7]        // pad pixels 8..31 with last valid
        dup             v2.8h,   v0.h[7]
        dup             v3.8h,   v0.h[7]
        dup             v5.8h,   v4.h[7]
        dup             v6.8h,   v4.h[7]
        dup             v7.8h,   v4.h[7]
        subs            w8,  w8,  #2            // two rows per pass
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v0.8h // accumulate row sums for DC
        add             v17.8h,  v17.8h,  v1.8h
        add             v18.8h,  v18.8h,  v2.8h
        add             v19.8h,  v19.8h,  v3.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        b.gt            1b
// Vertical padding for width 32, then DC computation and subtraction.
// On entry v4..v7 hold the last stored 32-pixel row; w4 = h_pad rows.
L(ipred_cfl_ac_444_w32_hpad):
        cbz             w4,  3f                 // no vertical padding needed
2:      // Vertical padding (h_pad > 0): replicate the last row, two rows/pass,
        // still accumulating into the sums so padded rows count toward DC.
        subs            w4,  w4,  #2
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
        add             v16.8h,  v16.8h,  v4.8h
        add             v17.8h,  v17.8h,  v5.8h
        add             v18.8h,  v18.8h,  v6.8h
        add             v19.8h,  v19.8h,  v7.8h
        b.gt            2b
3:

        // Quadruple the height and reuse the w8 subtracting
        // (the shared subtract loop assumes 8-wide rows; 32 = 4*8).
        lsl             w6,  w6,  #2
        // Aggregate the sums, with wider intermediates earlier than in
        // ipred_cfl_ac_420_w8_calc_subtract_dc.
        uaddlp          v0.4s,   v16.8h         // pairwise widen u16 -> u32 before adding,
        uaddlp          v1.4s,   v17.8h         // to avoid 16-bit overflow at this size
        uaddlp          v2.4s,   v18.8h
        uaddlp          v3.4s,   v19.8h
        add             v0.4s,   v0.4s,   v1.4s
        add             v2.4s,   v2.4s,   v3.4s
        add             v0.4s,   v0.4s,   v2.4s
        addv            s0,  v0.4s                // sum
        sub             x0,  x0,  w6, uxtw #4     // rewind to the buffer start
        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
        dup             v4.8h,   v4.h[0]          // broadcast the rounded average (DC)
        b               L(ipred_cfl_ac_420_w8_subtract_dc)
5282
// Width dispatch table: hword offsets back from the table base, indexed by
// the clz-derived width class computed in the prologue (w32 first, w4 last).
L(ipred_cfl_ac_444_tbl):
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
5288
// Width-32 wpad dispatch table: hword offsets back from the table base,
// indexed by the (pre-scaled) wpad value loaded in L(ipred_cfl_ac_444_w32).
L(ipred_cfl_ac_444_w32_tbl):
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
endfunc
5295