/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
// DC_128 prediction: fill the block with the mid-range value
// (bitdepth_max + 1) >> 1, used when no edge pixels are available.
function ipred_dc_128_16bpc_neon, export=1
        ldr             w8,  [sp]               // bitdepth_max (9th argument, on the stack)
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: 0..4 for width 64..4
        ldrh            w3,  [x5, w3, uxtw #1]
        dup             v0.8h,   w8
        sub             x5,  x5,  w3, uxtw      // per-width branch target
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        urshr           v0.8h,   v0.8h,  #1     // v0 = (bitdepth_max + 1) >> 1
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4            // four rows per iteration
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) - 160b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc

// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Vertical prediction: replicate the top edge row into every output row.
function ipred_v_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: jump table index
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #2            // skip the top-left corner pixel
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]          // load the top row once, store it repeatedly
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc

// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Horizontal prediction: broadcast each left-edge pixel across its row.
// The left edge is read 4 pixels at a time, walking upwards (x7 = -8 bytes);
// ld4r replicates each of the 4 consecutive pixels into its own vector,
// so v3 holds the first (topmost) of the 4 rows and v0 the last.
function ipred_h_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: jump table index
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2,  #8            // step back to the 4 left pixels above dst row 3
        sub             x5,  x5,  w3, uxtw
        mov             x7,  #-8                // walk the left edge upwards, 4 pixels at a time
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.4h},  [x0], x1
        st1             {v2.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]          // second 8 pixels via plain str,
        str             q2,  [x6, #16]          // first 8 via st1 with row advance
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        stp             q3,  q3,  [x0, #64]
        stp             q2,  q2,  [x6, #64]
        stp             q3,  q3,  [x0, #96]
        stp             q2,  q2,  [x6, #96]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        stp             q1,  q1,  [x0, #64]
        stp             q0,  q0,  [x6, #64]
        stp             q1,  q1,  [x0, #96]
        stp             q0,  q0,  [x6, #96]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc

// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
// DC_TOP prediction: fill the block with the rounded average of the top
// edge row; the divide is a rounding shift by log2(width).
function ipred_dc_top_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: jump table index
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #2            // skip the top-left corner pixel
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h          // sum of 4 pixels (fits in 16 bits)
        urshr           v0.4h,   v0.4h,   #2    // rounded average
        dup             v0.4h,   v0.h[0]
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h // pairwise reduce 16 -> 8 partial sums
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h          // widen: 32 pixels can overflow 16 bits
        rshrn           v4.4h,   v0.4s,   #5
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v4.4h,   v0.4s,   #6
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc

// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
// DC_LEFT prediction: average the left edge column, then fill the block.
// Dispatch is two-level: first branch (x5) to the height-specific summing
// code, which then branches (x3) to the width-specific store loop. Both
// targets come from the same table: w-entries first 5, h-entries last 5.
function ipred_dc_left_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1   // x2 = &topleft[-height] (16-bit pixels)
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3, uxtw      // x3 = width-specific store loop
        sub             x5,  x5,  w7, uxtw      // x5 = height-specific sum code
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h          // sum of 4 pixels
        urshr           v0.4h,   v0.4h,   #2    // rounded average
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlp          v0.4s,   v0.8h          // widen: 32-pixel sum can overflow 16 bits
        addv            s0,      v0.4s
        rshrn           v4.4h,   v0.4s,   #5
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v4.4h,   v0.4s,   #6
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
// DC prediction: average of the top row plus the left column.
// dc = (sum + (w+h)/2) >> ctz(w+h), then for non-square blocks a further
// exact divide by the remaining odd factor (3 or 5) via reciprocal
// multiply: (x * 0xAAAB) >> 17 == x/3, (x * 0x6667) >> 17 == x/5.
// Dispatch is two-level like dc_left: x5 -> height (left sum) code,
// which tail-branches via x3 to the width (top sum + store) code.
function ipred_dc_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1    // x2 = &topleft[-height] (16-bit pixels)
        add             w7,  w3,  w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.4s, w7               // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                  // rbit(width + height)
        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                  // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                  // -ctz(width + height)
        sub             x3,  x5,  w3, uxtw       // x3 = width-specific continuation
        sub             x5,  x5,  w6, uxtw       // x5 = height-specific entry
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1 = rounding bias
        dup             v17.4s,  w7              // -ctz(width + height), for ushl-right
        add             x6,  x0,  x1             // x6 = second output row
        lsl             x1,  x1,  #1             // each store pair advances two rows
        br              x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,      v0.4h           // s0 = sum of the left column
        add             x2,  x2,  #2             // skip the top-left corner pixel
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        uaddlv          s1,      v1.4h           // sum of the top row
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // sum >> ctz(width+height)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667             // reciprocal of 5 (h=16: w+h=20)
        mov             w17, #0xAAAB             // reciprocal of 3 (h=8: w+h=12)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.4h,   v0.h[0]
2:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,      v0.8h
        add             x2,  x2,  #2
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s
        uaddlv          s1,      v1.8h
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667             // /5 (h=32: w+h=40)
        mov             w17, #0xAAAB             // /3 (h=4 or 16: w+h=12 or 24)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
2:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2], #32
        addp            v0.8h,   v0.8h,   v1.8h
        add             x2,  x2,  #2
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s
        addp            v1.8h,   v1.8h,   v2.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 4/8/32/64
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667             // /5 (h=4 or 64: w+h=20 or 80)
        mov             w17, #0xAAAB             // /3 (h=8 or 32: w+h=24 or 48)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        add             x2,  x2,  #2
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s
        addp            v1.8h,   v1.8h,   v2.8h
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v1.8h,   v1.8h,   v3.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #32
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #0x6667             // /5 (h=8: w+h=40)
        mov             w17, #0xAAAB             // /3 (h=16 or 64: w+h=48 or 96)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        add             x2,  x2,  #2
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
        add             v0.2s,   v0.2s,   v16.2s
        addp            v1.8h,   v1.8h,   v2.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v22.8h,  v22.8h,  v23.8h
        addp            v1.8h,   v1.8h,   v3.8h
        addp            v20.8h,  v20.8h,  v22.8h
        addp            v1.8h,   v1.8h,   v20.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #64
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 16/32
        cmp             w4,  #16
        mov             w16, #0x6667             // /5 (h=16: w+h=80)
        mov             w17, #0xAAAB             // /3 (h=32: w+h=96)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        sub             x1,  x1,  #64            // compensate for the post-incremented #64 stores
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc
810
811// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
812//                             const pixel *const topleft,
813//                             const int width, const int height, const int a,
814//                             const int max_width, const int max_height);
// Paeth prediction: each output pixel is left, top or topleft — whichever
// is closest to (left + top - topleft).
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_paeth_16bpc_neon, export=1
        clz             w9,  w3                   // width is 4..64 -> clz in 25..29
        adr             x5,  L(ipred_paeth_tbl)
        sub             w9,  w9,  #25             // jump-table index, 0 (w=64) .. 4 (w=4)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x2]            // v4 = topleft pixel, splatted
        add             x8,  x2,  #2              // x8 -> top row (topleft[1])
        sub             x2,  x2,  #8              // x2 -> left column, read 4 pixels backwards
        sub             x5,  x5,  w9, uxtw        // resolve relative table entry
        mov             x7,  #-8                  // left-pointer decrement per iteration
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer -> double stride
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.2d},  [x8]            // v5 = top[0..3], repeated in both halves
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
4:
        // Load 4 left pixels (stored below topleft, so in reverse row order),
        // each splatted across a 4-lane vector.
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
        zip1            v0.2d,   v0.2d,   v1.2d   // v0 = left for rows 3|2
        zip1            v2.2d,   v2.2d,   v3.2d   // v2 = left for rows 1|0
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v2.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v23.8h,  v4.8h,   v17.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v2.8h,   v17.8h
        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
        umin            v19.8h,  v21.8h,  v23.8h
        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v23.8h,  v21.8h
        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v19.8h,  v17.8h
        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v21.d}[1], [x0], x1      // row 0
        st1             {v21.d}[0], [x6], x1      // row 1
        subs            w4,  w4,  #4
        st1             {v20.d}[1], [x0], x1      // row 2
        st1             {v20.d}[0], [x6], x1      // row 3
        b.gt            4b
        ret
80:
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v5.8h},  [x8], #16       // first 8 top pixels
        mov             w9,  w3                   // back up width
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride ...
        sub             x1,  x1,  w3, uxtw #1     // ... minus width in bytes (row-group step)
1:
        // Load the left pixels for the next 4 rows (reverse row order), splatted.
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
2:
        // Horizontal loop: 4 rows x 8 columns per iteration.
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v1.8h
        add             v18.8h,  v6.8h,   v2.8h
        add             v19.8h,  v6.8h,   v3.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v5.8h,   v18.8h
        sabd            v23.8h,  v5.8h,   v19.8h
        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v25.8h,  v4.8h,   v17.8h
        sabd            v26.8h,  v4.8h,   v18.8h
        sabd            v27.8h,  v4.8h,   v19.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v1.8h,   v17.8h
        sabd            v18.8h,  v2.8h,   v18.8h
        sabd            v19.8h,  v3.8h,   v19.8h
        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
        umin            v29.8h,  v21.8h,  v25.8h
        umin            v30.8h,  v22.8h,  v26.8h
        umin            v31.8h,  v23.8h,  v27.8h
        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v25.8h,  v21.8h
        cmge            v22.8h,  v26.8h,  v22.8h
        cmge            v23.8h,  v27.8h,  v23.8h
        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v29.8h,  v17.8h
        cmge            v18.8h,  v30.8h,  v18.8h
        cmge            v19.8h,  v31.8h,  v19.8h
        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b,  v4.16b
        bsl             v21.16b, v5.16b,  v4.16b
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b,  v18.16b
        bit             v21.16b, v1.16b,  v17.16b
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v23.8h}, [x0], #16       // row 0
        st1             {v22.8h}, [x6], #16       // row 1
        subs            w3,  w3,  #8
        st1             {v21.8h}, [x5], #16       // row 2
        st1             {v20.8h}, [x10], #16      // row 3
        b.le            8f
        ld1             {v5.8h},  [x8], #16       // next 8 top pixels
        b               2b
8:
        subs            w4,  w4,  #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer to start of row
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        // Load the top row as early as possible
        ld1             {v5.8h},  [x8], #16
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_paeth_tbl):
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) -  80b
        .hword L(ipred_paeth_tbl) -  40b
endfunc
944
945// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
946//                              const pixel *const topleft,
947//                              const int width, const int height, const int a,
948//                              const int max_width, const int max_height);
// Smooth prediction: out = (256*(bottom+right) + w_hor*(left-right)
//                           + w_ver*(top-bottom) + 256) >> 9
// which is the weighted blend (w_ver*top + (256-w_ver)*bottom
//                              + w_hor*left + (256-w_hor)*right + 256) >> 9.
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_smooth_16bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw        // x11 = weights_ver (offset by height)
        add             x10, x10, w3, uxtw        // x10 = weights_hor (offset by width)
        clz             w9,  w3                   // jump-table index from width (4..64)
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4, uxtw #1     // x12 -> last left pixel (bottom)
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x12] // bottom
        add             x8,  x2,  #2              // x8 -> top row
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        dup             v5.8h,   v6.h[3]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v0.8h,   v7.8h
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v18.4h
        smlal2          v23.4s,  v6.8h,   v18.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow to pixels
        rshrn           v21.4h,  v21.4s,  #9
        rshrn           v22.4h,  v22.4s,  #9
        rshrn           v23.4h,  v23.4s,  #9
        st1             {v20.4h}, [x0], x1
        st1             {v21.4h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.4h}, [x0], x1
        st1             {v23.4h}, [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        dup             v5.8h,   v6.h[7]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
        smlal           v22.4s,  v2.4h,   v7.4h
        smlal2          v23.4s,  v2.8h,   v7.8h
        smlal           v24.4s,  v1.4h,   v7.4h
        smlal2          v25.4s,  v1.8h,   v7.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v17.4h
        smlal2          v23.4s,  v6.8h,   v17.8h
        smlal           v24.4s,  v6.4h,   v18.4h
        smlal2          v25.4s,  v6.8h,   v18.8h
        smlal           v26.4s,  v6.4h,   v19.4h
        smlal2          v27.4s,  v6.8h,   v19.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow to pixels
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3, uxtw #1     // x12 -> pixel right of the top row
        sub             x1,  x1,  w3, uxtw #1     // 2*stride minus width (row-group step)
        ld1r            {v5.8h}, [x12]            // right
        sub             x2,  x2,  #4              // x2 -> left column, 2 pixels backwards/iter
        mov             x7,  #-4
        mov             w9,  w3                   // back up width
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right

1:
        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:
        // Horizontal loop: 2 rows x 16 columns per iteration.
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v2.8h, v3.8h}, [x8], #32 // top
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v24.4s,  v0.4h,   v6.4h
        smlal2          v25.4s,  v0.8h,   v6.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v2.8h,   v16.8h
        smlal           v22.4s,  v3.4h,   v16.4h
        smlal2          v23.4s,  v3.8h,   v16.8h
        smlal           v24.4s,  v2.4h,   v17.4h
        smlal2          v25.4s,  v2.8h,   v17.8h
        smlal           v26.4s,  v3.4h,   v17.4h
        smlal2          v27.4s,  v3.8h,   v17.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow to pixels
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer
        sub             x10, x10, w9, uxtw        // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc
1148
1149// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1150//                                const pixel *const topleft,
1151//                                const int width, const int height, const int a,
1152//                                const int max_width, const int max_height);
// Smooth-vertical prediction: blend top row towards the bottom-left pixel,
// out = bottom + (((top - bottom) * w_ver + 128) >> 8).
// The weights are loaded <<7 so sqrdmulh ((2*a*b + 0x8000) >> 16) computes
// the rounded >>8 multiply directly.
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_smooth_v_16bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw        // x7 = weights_ver (offset by height)
        clz             w9,  w3                   // jump-table index from width (4..64)
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw #1     // x8 -> last left pixel (bottom)
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x8] // bottom
        add             x2,  x2,  #2              // x2 -> top row
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v18.8h,  v18.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v18.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v17.8h
        sqrdmulh        v22.8h,  v6.8h,   v18.8h
        sqrdmulh        v23.8h,  v6.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride ...
        sub             x1,  x1,  w3, uxtw #1     // ... minus width in bytes (row-group step)
        mov             w9,  w3                   // back up width

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
2:
        // Horizontal loop: 4 rows x 16 columns per iteration.
        ld1             {v2.8h, v3.8h}, [x2], #32 // top
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v16.8h
        sqrdmulh        v22.8h,  v2.8h,   v17.8h
        sqrdmulh        v23.8h,  v3.8h,   v17.8h
        sqrdmulh        v24.8h,  v2.8h,   v18.8h
        sqrdmulh        v25.8h,  v3.8h,   v18.8h
        sqrdmulh        v26.8h,  v2.8h,   v19.8h
        sqrdmulh        v27.8h,  v3.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v4.8h
        add             v26.8h,  v26.8h,  v4.8h
        add             v27.8h,  v27.8h,  v4.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        st1             {v24.8h, v25.8h}, [x5], #32
        st1             {v26.8h, v27.8h}, [x8], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9, uxtw #1     // rewind top pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc
1275
1276// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1277//                                const pixel *const topleft,
1278//                                const int width, const int height, const int a,
1279//                                const int max_width, const int max_height);
// Smooth-horizontal prediction: blend left column towards the top-right pixel,
// out = right + (((left - right) * w_hor + 128) >> 8).
// The weights are loaded <<7 so sqrdmulh ((2*a*b + 0x8000) >> 16) computes
// the rounded >>8 multiply directly.
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_smooth_h_16bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw        // x8 = weights_hor (offset by width)
        clz             w9,  w3                   // jump-table index from width (4..64)
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw #1     // x12 -> pixel right of the top row
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.8h},  [x12] // right
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v1.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        sub             v3.8h,   v3.8h,   v5.8h   // left-right
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v0.8h,   v0.8h,   v5.8h
        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v1.8h,   v7.8h
        sqrdmulh        v23.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride ...
        sub             x1,  x1,  w3, uxtw #1     // ... minus width in bytes (row-group step)
        mov             w9,  w3                   // back up width

1:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
2:
        // Horizontal loop: 4 rows x 16 columns per iteration.
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
        ushll2          v7.8h,   v7.16b,  #7
        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v2.8h,   v6.8h
        sqrdmulh        v23.8h,  v2.8h,   v7.8h
        sqrdmulh        v24.8h,  v1.8h,   v6.8h
        sqrdmulh        v25.8h,  v1.8h,   v7.8h
        sqrdmulh        v26.8h,  v0.8h,   v6.8h
        sqrdmulh        v27.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        add             v24.8h,  v24.8h,  v5.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v5.8h
        add             v27.8h,  v27.8h,  v5.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0],  #32
        st1             {v22.8h, v23.8h}, [x6],  #32
        st1             {v24.8h, v25.8h}, [x5],  #32
        st1             {v26.8h, v27.8h}, [x10], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9, uxtw        // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc
1407
// Sliding padding mask: 48 zero bytes followed by 48 0xff bytes.
// Loading 32 bytes starting 2*n bytes before the `padding_mask` label yields
// a mask whose first n halfwords are 0x0000 and the rest 0xffff; the edge
// functions below use it with bit/bsl to replace out-of-range input pixels
// with a replicated padding pixel.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
1423
1424// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
1425//                                        const pixel *const in, const int end,
1426//                                        const int bitdepth_max);
// Upsample the edge by 2x with a 4-tap filter:
//   out[2*i]   = in[i+1]
//   out[2*i+1] = clip((9*(in[i+1] + in[i+2]) - (in[i] + in[i+3]) + 8) >> 4,
//                     0, bitdepth_max)
// Input pixels at index >= end are replaced by in[end] via the sliding
// padding mask. Processes 16 input pixels, producing 32 output pixels.
// x0=out, w1=hsz, x2=in, w3=end, w4=bitdepth_max
function ipred_z1_upsample_edge_16bpc_neon, export=1
        dup             v30.8h,  w4               // bitdepth_max
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h},  [x2]     // in[]
        add             x5,  x2,  w3,  uxtw #1    // in[end]
        sub             x4,  x4,  w3,  uxtw #1    // mask: 0x0000 for i < end, 0xffff after

        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v3.8h, v4.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v3.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v4.16b

        ext             v4.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
        ext             v5.16b,  v1.16b,  v2.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #4  // in[i+2]
        ext             v7.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6  // in[i+3]
        ext             v17.16b, v1.16b,  v2.16b,  #6

        add             v18.8h,  v4.8h,   v6.8h   // in[i+1] + in[i+2]
        add             v19.8h,  v5.8h,   v7.8h
        add             v20.8h,  v0.8h,   v16.8h  // in[i+0] + in[i+3]
        add             v21.8h,  v1.8h,   v17.8h
        umull           v22.4s,  v18.4h,  v31.4h  // 9*(in[i+1] + in[i+2])
        umull2          v23.4s,  v18.8h,  v31.8h
        umull           v24.4s,  v19.4h,  v31.4h
        umull2          v25.4s,  v19.8h,  v31.8h
        usubw           v22.4s,  v22.4s,  v20.4h  // - (in[i+0] + in[i+3])
        usubw2          v23.4s,  v23.4s,  v20.8h
        usubw           v24.4s,  v24.4s,  v21.4h
        usubw2          v25.4s,  v25.4s,  v21.8h

        // Rounded >>4 with unsigned saturation (clamps negatives to 0)
        sqrshrun        v16.4h,  v22.4s,  #4
        sqrshrun2       v16.8h,  v23.4s,  #4
        sqrshrun        v17.4h,  v24.4s,  #4
        sqrshrun2       v17.8h,  v25.4s,  #4

        smin            v16.8h,  v16.8h,  v30.8h  // clamp to bitdepth_max
        smin            v17.8h,  v17.8h,  v30.8h

        // Interleave the original in[i+1] pixels with the filtered values
        zip1            v0.8h,   v4.8h,   v16.8h
        zip2            v1.8h,   v4.8h,   v16.8h
        zip1            v2.8h,   v5.8h,   v17.8h
        zip2            v3.8h,   v5.8h,   v17.8h

        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]

        ret
endfunc
1479
// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
//                                        const pixel *const in,
//                                        const int bitdepth_max);
// Like ipred_z1_upsample_edge, but with the interpolated pixel placed
// before each original pixel, and in[-1] treated as equal to in[0].
function ipred_z2_upsample_edge_16bpc_neon, export=1
        dup             v30.8h,  w3               // bitdepth_max
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h}, [x2]      // in[]
        add             x5,  x2,  w1,  uxtw #1    // in[sz]
        sub             x4,  x4,  w1,  uxtw #1    // padding_mask - 2*sz

        ld1r            {v3.8h},  [x2]            // in[0] for padding
        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v4.8h, v5.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v4.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v5.16b

        // v4 = in[i-1] (with in[-1] == in[0]), v5 = in[i+1], v6 = in[i+2]
        ext             v4.16b,  v3.16b,  v0.16b,  #14
        ext             v5.16b,  v0.16b,  v1.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #4

        add             v16.8h,  v0.8h,   v5.8h   // in[i+0] + in[i+1]
        add             v17.8h,  v4.8h,   v6.8h   // in[i-1] + in[i+2]
        umull           v18.4s,  v16.4h,  v31.4h  // 9*(in[i+0] + in[i+1])
        umull2          v19.4s,  v16.8h,  v31.8h
        usubw           v18.4s,  v18.4s,  v17.4h  // - (in[i-1] + in[i+2])
        usubw2          v19.4s,  v19.4s,  v17.8h

        sqrshrun        v16.4h,  v18.4s,  #4      // (x + 8) >> 4, clamped at 0
        sqrshrun2       v16.8h,  v19.4s,  #4

        add             x5,  x0,  #2*16

        smin            v16.8h,  v16.8h,  v30.8h  // clamp above at bitdepth_max

        // Interleave interpolated and original pixels.
        zip1            v4.8h,   v0.8h,   v16.8h
        zip2            v5.8h,   v0.8h,   v16.8h

        // In case sz=8, output one single pixel in out[16].
        st1             {v2.h}[0], [x5]
        st1             {v4.8h, v5.8h}, [x0]

        ret
endfunc
1527
// Symmetric 3-tap smoothing kernels (center coefficient and its neighbour;
// the filter in ipred_z1_filter_edge applies them as k0,k1,k0 with a >>4),
// indexed by strength-1. Strength 3 uses a 5-tap path with hardcoded
// coefficients instead.
const edge_filter
        .short 0, 4, 8, 0
        .short 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .short 2, 4, 4, 0
endconst
1534
// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
//                                      const pixel *const in, const int end,
//                                      const int strength);
// Smooths an intra edge with a symmetric 3-tap (strength 1-2) or 5-tap
// (strength 3) kernel, with rounding (>> 4). Pixels past in[end] are
// treated as equal to the last valid pixel.
function ipred_z1_filter_edge_16bpc_neon, export=1
        cmp             w4, #3
        b.eq            L(fivetap)                // if (strength == 3) goto fivetap

        // Load the two nonzero kernel taps for strength 1 or 2.
        movrel          x5,  edge_filter, -6
        add             x5,  x5,  w4,  uxtw #3    // edge_filter + 2*((strength - 1)*4 + 1)

        ld1             {v31.s}[0], [x5]          // kernel[1-2]

        ld1             {v0.8h}, [x2], #16

        dup             v30.8h, v31.h[0]          // outer tap
        dup             v31.8h, v31.h[1]          // center tap
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f
        // v3/v4 = in[i+1], v5/v6 = in[i+2]
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        // out = (k0*in[i] + k1*in[i+1] + k0*in[i+2] + 8) >> 4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        sub             w3,  w3,  #16
        st1             {v16.8h, v17.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #24
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h}, [x5] // padding_mask

        ld1r            {v2.8h}, [x6]
        bit             v0.16b,  v2.16b,  v3.16b  // Pad v0-v1
        bit             v1.16b,  v2.16b,  v4.16b

        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        st1             {v16.8h, v17.8h}, [x0], #32
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        // Strength 3: 5-tap kernel (2, 4, 4, 4, 2) >> 4, with in[-1] == in[0].
        sub             x2,  x2,  #2              // topleft -= 1 pixel
        movi            v29.8h, #2
        ld1             {v0.8h}, [x2], #16
        movi            v30.8h, #4
        movi            v31.8h, #4
        ins             v0.h[0], v0.h[1]          // replicate in[0] into in[-1]
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f                        // if (end + 1 < 19)
        // v3/v4 = in[i], v5/v6 = in[i+1], v16/v17 = in[i+2], v18/v19 = in[i+3]
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8
        ext             v19.16b, v1.16b,  v2.16b,  #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask, -2
        sub             w6,  w3,  #23
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask

        ld1r            {v28.8h}, [x6]
        bit             v0.16b,  v28.16b, v3.16b  // Pad v0-v2
        bit             v1.16b,  v28.16b, v4.16b
        bit             v2.16b,  v28.16b, v5.16b
4:
        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8
        ext             v19.16b, v1.16b,  v2.16b,  #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v28.16b          // refill with padding
        mov             v2.16b,  v28.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #8
        st1             {v28.8h}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
1713
// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
//                                 const int n);
// Fills out[0..n) with the pixel value px, 8 pixels per iteration.
function ipred_pixel_set_16bpc_neon, export=1
        dup             v0.8h,   w1               // broadcast px to all lanes
1:
        st1             {v0.8h}, [x0], #16
        subs            w2,  w2,  #8              // n -= 8
        b.gt            1b
        ret
endfunc
1724
// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const int width, const int height,
//                                const int dx, const int max_base_x);
// Z1 (vertical-ish) directional prediction: for each row, interpolate
// between top[base] and top[base+1] with a 6-bit fraction, advancing
// xpos by dx per row. Rows with base >= max_base_x are filled with the
// replicated padding pixel top[max_base_x]. Two rows per iteration.
function ipred_z1_fill1_16bpc_neon, export=1
        clz             w9,  w3
        adr             x8,  L(ipred_z1_fill1_tbl)
        sub             w9,  w9,  #25             // table index from log2(width)
        ldrh            w9,  [x8, w9, uxtw #1]
        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
        sub             x8,  x8,  w9,  uxtw
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        br              x8
40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        lsl             w8,  w8,  #1              // byte offsets (2 bytes/pixel)
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #2
        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h},  [x8]            // top[base]
        ld1             {v2.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        ldr             h1, [x8, #16]             // 9th pixel, for the ext below
        ldr             h3, [x10, #16]
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        ext             v1.16b,  v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v3.16b,  #2
        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
        umlal           v16.4s,  v1.4h,   v4.4h   // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v1.8h,   v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v3.4h,   v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v3.8h,   v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                   // remember width

        add             x13, x0,  x1              // second output row
        lsl             x1,  x1,  #1              // stride *= 2 (two rows/iter)
        sub             x1,  x1,  w3,  uxtw #1    // compensate for the #32 post-increments
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v6.8h,   w9               // frac
        dup             v7.8h,   w11
        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // top[base]
        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v16.8h,  w9               // 64 - frac
        dup             v17.8h,  w11
        add             w7,  w7,  w5              // xpos += dx
2:
        // Inner loop over the width, 16 pixels per iteration for each
        // of the two rows.
        ext             v18.16b, v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v19.16b, v1.16b,  v2.16b,  #2
        ext             v20.16b, v3.16b,  v4.16b,  #2
        ext             v21.16b, v4.16b,  v5.16b,  #2
        subs            w3,  w3,  #16
        umull           v22.4s,  v0.4h,   v16.4h  // top[base]*(64-frac)
        umlal           v22.4s,  v18.4h,  v6.4h   // + top[base+1]*frac
        umull2          v23.4s,  v0.8h,   v16.8h
        umlal2          v23.4s,  v18.8h,  v6.8h
        umull           v24.4s,  v1.4h,   v16.4h
        umlal           v24.4s,  v19.4h,  v6.4h
        umull2          v25.4s,  v1.8h,   v16.8h
        umlal2          v25.4s,  v19.8h,  v6.8h
        umull           v26.4s,  v3.4h,   v17.4h
        umlal           v26.4s,  v20.4h,  v7.4h
        umull2          v27.4s,  v3.8h,   v17.8h
        umlal2          v27.4s,  v20.8h,  v7.8h
        umull           v28.4s,  v4.4h,   v17.4h
        umlal           v28.4s,  v21.4h,  v7.4h
        umull2          v29.4s,  v4.8h,   v17.8h
        umlal2          v29.4s,  v21.8h,  v7.8h
        rshrn           v22.4h,  v22.4s,  #6
        rshrn2          v22.8h,  v23.4s,  #6
        rshrn           v23.4h,  v24.4s,  #6
        rshrn2          v23.8h,  v25.4s,  #6
        rshrn           v24.4h,  v26.4s,  #6
        rshrn2          v24.8h,  v27.4s,  #6
        rshrn           v25.4h,  v28.4s,  #6
        rshrn2          v25.8h,  v29.4s,  #6
        st1             {v22.8h, v23.8h}, [x0],  #32
        st1             {v24.8h, v25.8h}, [x13], #32
        b.le            3f
        mov             v0.16b,  v2.16b
        ld1             {v1.8h, v2.8h}, [x8],  #32 // top[base]
        mov             v3.16b,  v5.16b
        ld1             {v4.8h, v5.8h}, [x10], #32
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12                  // restore width
        b               1b
9:
        ret

169:
        // Out of range; fill both rows with the padding pixel.
        st1             {v31.8h}, [x0],  #16
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b

L(ipred_z1_fill1_tbl):
        .hword L(ipred_z1_fill1_tbl) - 640b
        .hword L(ipred_z1_fill1_tbl) - 320b
        .hword L(ipred_z1_fill1_tbl) - 160b
        .hword L(ipred_z1_fill1_tbl) -  80b
        .hword L(ipred_z1_fill1_tbl) -  40b
endfunc
1927
// void ipred_z1_fill2_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const int width, const int height,
//                                const int dx, const int max_base_x);
// Variant of ipred_z1_fill1 for upsampled edges: top[] holds interleaved
// even/odd pixels, so top[base] and top[base+1] are deinterleaved with
// uzp1/uzp2 instead of a shifted copy. Only width 4 and 8 occur here.
function ipred_z1_fill2_16bpc_neon, export=1
        cmp             w3,  #8
        // Pixels are 16 bit wide: the byte offset of top[max_base_x] is
        // 2*max_base_x, and the padding pixel must be replicated as a
        // 16 bit element (not as a byte as in the 8 bpc version).
        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        lsl             w8,  w8,  #1              // byte offsets
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8h,   v0.8h,   v0.8h   // top[base+1]
        uzp1            v0.8h,   v0.8h,   v0.8h   // top[base]
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h},  [x8]     // top[base]
        ld1             {v2.8h, v3.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // top[base+1]
        uzp1            v0.8h,   v0.8h,   v1.8h   // top[base]
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret
endfunc
2024
// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
//                               const int n);
// Copies n pixels from src to dst in reversed order, 8 at a time.
function ipred_reverse_16bpc_neon, export=1
        sub             x1,  x1,  #16             // start at the last 8 pixels
        add             x3,  x0,  #8              // second write pointer, +4 pixels
        mov             x4,  #16                  // both pointers step 16 bytes
1:
        ld1             {v0.8h}, [x1]
        rev64           v0.8h,  v0.8h             // reverse within each 64-bit half
        sub             x1,  x1,  #16             // walk src backwards
        subs            w2,  w2,  #8
        // Storing the high half before the low half completes the
        // full 128-bit reversal.
        st1             {v0.d}[1], [x0], x4
        st1             {v0.d}[0], [x3], x4
        b.gt            1b
        ret
endfunc
2041
// Per-lane column offsets 0..7, used by ipred_z2_fill* below.
const increments
        .short          0,  1,  2,  3,  4,  5,  6,  7
endconst
2045
2046// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
2047//                                const pixel *const top,
2048//                                const pixel *const left,
2049//                                const int width, const int height,
2050//                                const int dx, const int dy);
2051function ipred_z2_fill1_16bpc_neon, export=1
2052        clz             w10, w4
2053        adr             x9,  L(ipred_z2_fill1_tbl)
2054        sub             w10, w10, #25
2055        ldrh            w10, [x9, w10, uxtw #1]
2056        mov             w8,  #(1 << 6)            // xpos = 1 << 6
2057        sub             x9,  x9,  w10, uxtw
2058        sub             w8,  w8,  w6              // xpos -= dx
2059
2060        movrel          x11, increments
2061        ld1             {v31.8h},  [x11]          // increments
2062        neg             w7,  w7                   // -dy
2063
2064        br              x9
206540:
2066        AARCH64_VALID_JUMP_TARGET
2067
2068        dup             v30.4h,  w7               // -dy
2069        movi            v17.8b,  #1
2070
2071        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2072        movi            v25.8h,  #0x3e
2073        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2074
2075        // Worst case height for w=4 is 16, but we need at least h+1 elements
2076        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
2077
2078        movi            v26.8h,  #64
2079        movi            v19.16b, #4
2080
2081        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2082        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2083
2084        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2085
2086        movi            v23.4h,  #1, lsl #8
2087        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2088        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2089        movi            v17.8b,  #2
2090        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2091
2092        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2093        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2094
2095        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2096
2097        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2098
2099        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2100
2101        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2102
2103        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2104        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2105
2106        movi            v29.16b, #4
21074:
2108        asr             w9,  w8,  #6              // base_x
2109        dup             v16.4h,  w8               // xpos
2110        sub             w8,  w8,  w6              // xpos -= dx
2111        cmp             w9,  #-4                  // base_x <= -4
2112        asr             w11, w8,  #6              // base_x
2113        b.le            49f
2114
2115        lsl             w9,  w9,  #1
2116        lsl             w11, w11, #1
2117
2118        dup             v17.4h,  w8               // xpos
2119
2120        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2121        ldr             q6,  [x2, w11, sxtw]
2122
2123        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2124
2125        // Cut corners here; only doing tbl over v0-v1 here; we only
2126        // seem to need the last pixel, from v2, after skipping to the
2127        // left-only codepath below.
2128        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2129
2130        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2131
2132        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
2133        ext             v7.16b,  v6.16b,  v6.16b,  #2
2134
2135        and             v16.16b, v16.16b, v25.16b // frac_x
2136
2137        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2138
2139        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
2140        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
2141
2142        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2143
2144        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2145
2146        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2147        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2148        umull2          v22.4s,  v18.8h,  v28.8h
2149        umlal2          v22.4s,  v19.8h,  v27.8h
2150
2151        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
2152        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2153        umull2          v24.4s,  v4.8h,   v17.8h
2154        umlal2          v24.4s,  v5.8h,   v16.8h
2155
2156        cmge            v20.8h,  v20.8h,  #0
2157
2158        rshrn           v21.4h,  v21.4s,  #6
2159        rshrn2          v21.8h,  v22.4s,  #6
2160        rshrn           v22.4h,  v23.4s,  #6
2161        rshrn2          v22.8h,  v24.4s,  #6
2162
2163        bit             v21.16b, v22.16b, v20.16b
2164
2165        st1             {v21.d}[0], [x0], x1
2166        sub             w8,  w8,  w6              // xpos -= dx
2167        subs            w5,  w5,  #2
2168        st1             {v21.d}[1], [x0], x1
2169        b.le            9f
2170
2171        ext             v18.16b, v19.16b, v19.16b, #8
2172        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2173        b               4b
2174
217549:
2176        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2]
2177
2178        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2179
2180        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2181        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2182        umull2          v21.4s,  v18.8h,  v28.8h
2183        umlal2          v21.4s,  v19.8h,  v27.8h
2184
2185        rshrn           v20.4h,  v20.4s,  #6
2186        rshrn2          v20.8h,  v21.4s,  #6
2187
2188        st1             {v20.d}[0], [x0], x1
2189        subs            w5,  w5,  #2
2190        st1             {v20.d}[1], [x0], x1
2191        b.le            9f
2192
2193        ext             v18.16b, v19.16b, v19.16b, #8
2194        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2195        b               49b
2196
21979:
2198        ret
2199
220080:
2201        AARCH64_VALID_JUMP_TARGET
2202
2203        stp             d8,  d9,  [sp, #-0x40]!
2204        stp             d10, d11, [sp, #0x10]
2205        stp             d12, d13, [sp, #0x20]
2206        stp             d14, d15, [sp, #0x30]
2207
2208        dup             v18.8h,  w7               // -dy
2209        add             x3,  x3,  #2              // Skip past left[0]
2210
2211        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2212        movi            v25.8h,  #0x3e
2213        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2214
2215        // Worst case height for w=8 is 32.
2216        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2217        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2218
2219        movi            v26.8h,  #64
2220        movi            v19.16b, #4
2221
2222        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2223        and             v27.16b, v16.16b, v25.16b // frac_y
2224
2225        movi            v23.8h,  #1, lsl #8
2226        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2227        mov             v18.16b, v15.16b          // left[0]
2228        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2229        movi            v17.16b, #2
2230        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
2231
2232        // Cut corners here; for the first row we don't expect to need to
2233        // read outside of v0.
2234        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
2235
2236        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
2237        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
2238
2239        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
2240
2241        movi            v24.16b, #4
22428:
2243        asr             w9,  w8,  #6              // base_x
2244        dup             v16.8h,   w8              // xpos
2245        sub             w8,  w8,  w6              // xpos -= dx
2246        cmp             w9,  #-16                 // base_x <= -16
2247        asr             w11, w8,  #6              // base_x
2248        b.le            89f
2249
2250        dup             v17.8h,   w8              // xpos
2251
2252        add             x9,  x2,  w9,  sxtw #1
2253        add             x11, x2,  w11, sxtw #1
2254
2255        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
2256        mov             v19.16b, v15.16b          // left[0]
2257        ld1             {v6.8h, v7.8h}, [x11]
2258
2259        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2260
2261        mov             v20.16b, v15.16b          // left[0]
2262
2263        sshr            v21.8h,  v16.8h,  #6      // first base_x
2264        sshr            v22.8h,  v17.8h,  #6
2265
2266        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2267
2268        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
2269        ext             v7.16b,  v6.16b,  v7.16b,  #2
2270
2271        and             v16.16b, v16.16b, v25.16b // frac_x
2272        and             v17.16b, v17.16b, v25.16b
2273
2274        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2275        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2276
2277        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
2278        sub             v9.8h,   v26.8h,  v17.8h
2279
2280        umull2          v11.4s,  v18.8h,  v28.8h
2281        umlal2          v11.4s,  v19.8h,  v27.8h
2282
2283        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2284        add             v22.8h,  v22.8h,  v31.8h
2285
2286        umull           v12.4s,  v19.4h,  v28.4h
2287        umlal           v12.4s,  v20.4h,  v27.4h
2288        umull2          v13.4s,  v19.8h,  v28.8h
2289        umlal2          v13.4s,  v20.8h,  v27.8h
2290
2291        rshrn           v10.4h,  v10.4s,  #6
2292        rshrn2          v10.8h,  v11.4s,  #6
2293        rshrn           v11.4h,  v12.4s,  #6
2294        rshrn2          v11.8h,  v13.4s,  #6
2295
2296        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2297        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2298        umull2          v13.4s,  v4.8h,   v8.8h
2299        umlal2          v13.4s,  v5.8h,   v16.8h
2300        umull           v14.4s,  v6.4h,   v9.4h
2301        umlal           v14.4s,  v7.4h,   v17.4h
2302        umull2          v18.4s,  v6.8h,   v9.8h
2303        umlal2          v18.4s,  v7.8h,   v17.8h
2304
2305        cmge            v21.8h,  v21.8h,  #0
2306        cmge            v22.8h,  v22.8h,  #0
2307
2308        rshrn           v12.4h,  v12.4s,  #6
2309        rshrn2          v12.8h,  v13.4s,  #6
2310        rshrn           v13.4h,  v14.4s,  #6
2311        rshrn2          v13.8h,  v18.4s,  #6
2312
2313        bit             v10.16b, v12.16b, v21.16b
2314        bit             v11.16b, v13.16b, v22.16b
2315
2316        st1             {v10.8h}, [x0], x1
2317        subs            w5,  w5,  #2
2318        sub             w8,  w8,  w6              // xpos -= dx
2319        st1             {v11.8h}, [x0], x1
2320        b.le            9f
2321
2322        mov             v18.16b, v20.16b
2323        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2324        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2325        b               8b
2326
232789:
2328        mov             v19.16b, v15.16b
2329        mov             v20.16b, v15.16b
2330        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2331        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2332
2333        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2334        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2335        umull2          v5.4s,   v18.8h,  v28.8h
2336        umlal2          v5.4s,   v19.8h,  v27.8h
2337        umull           v6.4s,   v19.4h,  v28.4h
2338        umlal           v6.4s,   v20.4h,  v27.4h
2339        umull2          v7.4s,   v19.8h,  v28.8h
2340        umlal2          v7.4s,   v20.8h,  v27.8h
2341
2342        rshrn           v4.4h,   v4.4s,   #6
2343        rshrn2          v4.8h,   v5.4s,   #6
2344        rshrn           v5.4h,   v6.4s,   #6
2345        rshrn2          v5.8h,   v7.4s,   #6
2346
2347        st1             {v4.8h}, [x0], x1
2348        subs            w5,  w5,  #2
2349        st1             {v5.8h}, [x0], x1
2350        b.le            9f
2351
2352        mov             v18.16b, v20.16b
2353        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2354        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2355        b               89b
2356
23579:
2358        ldp             d14, d15, [sp, #0x30]
2359        ldp             d12, d13, [sp, #0x20]
2360        ldp             d10, d11, [sp, #0x10]
2361        ldp             d8,  d9,  [sp], 0x40
2362        ret
2363
2364160:
2365320:
2366640:
2367        AARCH64_VALID_JUMP_TARGET
2368
2369        stp             d8,  d9,  [sp, #-0x40]!
2370        stp             d10, d11, [sp, #0x10]
2371        stp             d12, d13, [sp, #0x20]
2372        stp             d14, d15, [sp, #0x30]
2373
2374        dup             v25.8h,  w7               // -dy
2375        add             x3,  x3,  #2              // Skip past left[0]
2376
2377        add             x13, x0,  x1              // alternating row
2378        lsl             x1,  x1,  #1              // stride *= 2
2379        sub             x1,  x1,  w4,  uxtw #1    // stride -= width
2380
2381        movi            v11.8h,  #8
2382        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
2383        add             v26.8h,  v26.8h,  v25.8h  // -= dy
2384        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
2385
2386        // Worst case height is 64, but we can only fit 32 pixels into
2387        // v0-v3 usable within one tbx instruction. As long as base_y is
2388        // up to 32, we use tbx.
2389        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2390        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2391
2392        mov             w12, w4                   // orig w
2393        neg             w14, w4                   // -w
2394
23951:
2396        mov             v23.16b, v26.16b          // reset ypos
2397
2398        asr             w9,  w8,  #6              // base_x
2399        dup             v16.8h,   w8              // xpos
2400        sub             w8,  w8,  w6              // xpos -= dx
2401        cmp             w9,  w14                  // base_x <= -2*w
2402        asr             w11, w8,  #6              // base_x
2403        b.le            169f
2404
2405        dup             v17.8h,   w8              // xpos
2406        sub             w8,  w8,  w6              // xpos -= dx
2407
2408        add             x9,  x2,  w9,  sxtw #1
2409        add             x11, x2,  w11, sxtw #1
2410
2411        sshr            v21.8h,  v16.8h,  #6      // first base_x
2412        sshr            v22.8h,  v17.8h,  #6
2413
2414        ld1             {v4.8h}, [x9], #16        // top[base_x]
2415        ld1             {v6.8h}, [x11], #16
2416
2417        movi            v10.8h,  #0x3e
2418        movi            v11.8h,  #64
2419
2420        and             v16.16b, v16.16b, v10.16b // frac_x
2421        and             v17.16b, v17.16b, v10.16b
2422
2423        sub             v8.8h,   v11.8h,  v16.8h  // 64 - frac_x
2424        sub             v9.8h,   v11.8h,  v17.8h
2425
2426        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2427        add             v22.8h,  v22.8h,  v31.8h
2428
24292:
2430        smov            w10,     v22.h[0]
2431
2432        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2433        movi            v12.8h,  #64
2434        cmp             w10, #0                   // base_x (bottom left) >= 0
2435        smov            w10,     v29.b[0]         // base_y[0]
2436        movi            v10.8h,  #0x3e
2437
2438        b.ge            4f
2439        and             v27.16b, v23.16b, v10.16b // frac_y
2440        cmp             w10,     #(32-3)
2441
2442        mov             v18.16b, v15.16b          // left[0]
2443        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2444        b.gt            22f
2445
244621:
2447        // base_y < 32, using tbx
2448        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2449        movi            v11.8h,  #1, lsl #8
2450        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2451        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2452
2453        movi            v13.16b, #2
2454
2455        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2456
2457        add             v29.16b, v29.16b, v13.16b // base_y + 1 (*2)
2458        mov             v19.16b, v15.16b          // left[0]
2459
2460        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2461
2462        add             v29.16b, v29.16b, v13.16b // base_y + 2 (*2)
2463        mov             v20.16b, v15.16b          // left[0]
2464
2465        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2466
2467        b               23f
2468
246922:
2470        // base_y >= 32, using separate loads.
2471        smov            w15,     v29.b[1]
2472        smov            w16,     v29.b[2]
2473        add             x10, x3,  w10, sxtw #1
2474        smov            w17,     v29.b[3]
2475        add             x15, x3,  w15, sxtw #1
2476        ld3             {v18.h, v19.h, v20.h}[0], [x10]
2477        smov            w10,     v29.b[4]
2478        add             x16, x3,  w16, sxtw #1
2479        ld3             {v18.h, v19.h, v20.h}[1], [x15]
2480        smov            w15,     v29.b[5]
2481        add             x17, x3,  w17, sxtw #1
2482        ld3             {v18.h, v19.h, v20.h}[2], [x16]
2483        smov            w16,     v29.b[6]
2484        add             x10, x3,  w10, sxtw #1
2485        ld3             {v18.h, v19.h, v20.h}[3], [x17]
2486        smov            w17,     v29.b[7]
2487        add             x15, x3,  w15, sxtw #1
2488        add             x16, x3,  w16, sxtw #1
2489        ld3             {v18.h, v19.h, v20.h}[4], [x10]
2490        add             x17, x3,  w17, sxtw #1
2491        ld3             {v18.h, v19.h, v20.h}[5], [x15]
2492        ld3             {v18.h, v19.h, v20.h}[6], [x16]
2493        ld3             {v18.h, v19.h, v20.h}[7], [x17]
2494
249523:
2496
2497        ld1             {v5.8h}, [x9], #16        // top[base_x]
2498        ld1             {v7.8h}, [x11], #16
2499
2500        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2501
2502        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2503        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2504        umull2          v11.4s,  v18.8h,  v28.8h
2505        umlal2          v11.4s,  v19.8h,  v27.8h
2506        umull           v12.4s,  v19.4h,  v28.4h
2507        umlal           v12.4s,  v20.4h,  v27.4h
2508        umull2          v13.4s,  v19.8h,  v28.8h
2509        umlal2          v13.4s,  v20.8h,  v27.8h
2510
2511        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2512        ext             v19.16b, v6.16b,  v7.16b,  #2
2513
2514        rshrn           v10.4h,  v10.4s,  #6
2515        rshrn2          v10.8h,  v11.4s,  #6
2516        rshrn           v11.4h,  v12.4s,  #6
2517        rshrn2          v11.8h,  v13.4s,  #6
2518
2519        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2520        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2521        umull2          v13.4s,  v4.8h,   v8.8h
2522        umlal2          v13.4s,  v18.8h,  v16.8h
2523        umull           v14.4s,  v6.4h,   v9.4h
2524        umlal           v14.4s,  v19.4h,  v17.4h
2525        umull2          v20.4s,  v6.8h,   v9.8h
2526        umlal2          v20.4s,  v19.8h,  v17.8h
2527
2528        cmge            v18.8h,  v21.8h,  #0
2529        cmge            v19.8h,  v22.8h,  #0
2530
2531        rshrn           v12.4h,  v12.4s,  #6
2532        rshrn2          v12.8h,  v13.4s,  #6
2533        rshrn           v13.4h,  v14.4s,  #6
2534        rshrn2          v13.8h,  v20.4s,  #6
2535
2536        bit             v10.16b, v12.16b, v18.16b
2537        bit             v11.16b, v13.16b, v19.16b
2538
2539        st1             {v10.8h}, [x0], #16
2540        subs            w4,  w4,  #8
2541        st1             {v11.8h}, [x13], #16
2542        b.le            3f
2543
2544        movi            v10.8h,  #8
2545        mov             v4.16b,  v5.16b
2546        mov             v6.16b,  v7.16b
2547        add             v21.8h,  v21.8h,  v10.8h  // base_x += 8
2548        add             v22.8h,  v22.8h,  v10.8h
2549        b               2b
2550
25513:
2552        subs            w5,  w5,  #2
2553        b.le            9f
2554        movi            v10.8h, #128
2555        add             x0,  x0,  x1
2556        add             x13, x13, x1
2557        mov             w4,  w12                  // reset w
2558        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
2559        b               1b
2560
25614:      // The rest of the row only predicted from top[]
2562        ld1             {v5.8h}, [x9], #16        // top[base_x]
2563        ld1             {v7.8h}, [x11], #16
2564
2565        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2566        ext             v19.16b, v6.16b,  v7.16b,  #2
2567
2568        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2569        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2570        umull2          v13.4s,  v4.8h,   v8.8h
2571        umlal2          v13.4s,  v18.8h,  v16.8h
2572        umull           v14.4s,  v6.4h,   v9.4h
2573        umlal           v14.4s,  v19.4h,  v17.4h
2574        umull2          v20.4s,  v6.8h,   v9.8h
2575        umlal2          v20.4s,  v19.8h,  v17.8h
2576
2577        rshrn           v12.4h,  v12.4s,  #6
2578        rshrn2          v12.8h,  v13.4s,  #6
2579        rshrn           v13.4h,  v14.4s,  #6
2580        rshrn2          v13.8h,  v20.4s,  #6
2581
2582        st1             {v12.8h}, [x0], #16
2583        subs            w4,  w4,  #8
2584        st1             {v13.8h}, [x13], #16
2585        b.le            3b
2586
2587        mov             v4.16b,  v5.16b
2588        mov             v6.16b,  v7.16b
2589        b               4b
2590
2591169:    // The rest of the block only predicted from left[]
2592        add             x1,  x1,  w4,  uxtw #1    // restore stride
2593        mov             w12, w5                   // orig remaining h
25941:
2595        movi            v12.8h,  #64
2596        movi            v10.8h,  #0x3e
2597
2598        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2599        and             v27.16b, v23.16b, v10.16b // frac_y
2600
2601        smov            w10,     v29.b[0]         // base_y[0]
2602
2603        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2604        movi            v11.8h,  #1, lsl #8
2605        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2606        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2607        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2608
2609        cmp             w10,     #(32-1)
2610
2611        mov             v18.16b, v15.16b          // left[0]
2612        movi            v21.16b, #2
2613
2614        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2615
2616        b.gt            31f
2617
2618        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2619        add             v29.16b, v29.16b, v21.16b // base_y + 1 (*2)
2620
26212:
2622        // base_y < 32, using tbx.
2623        smov            w10,     v29.b[0]         // base_y[0]
2624        mov             v19.16b, v15.16b          // left[0]
2625        cmp             w10,     #(64-4)
2626        b.gt            32f
2627        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2628        add             v29.16b, v29.16b, v21.16b // base_y + 2 (*2)
2629        mov             v20.16b, v15.16b          // left[0]
2630        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2631        add             v29.16b, v29.16b, v21.16b // next base_y
2632
2633        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2634        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2635        umull2          v11.4s,  v18.8h,  v28.8h
2636        umlal2          v11.4s,  v19.8h,  v27.8h
2637        umull           v12.4s,  v19.4h,  v28.4h
2638        umlal           v12.4s,  v20.4h,  v27.4h
2639        umull2          v13.4s,  v19.8h,  v28.8h
2640        umlal2          v13.4s,  v20.8h,  v27.8h
2641
2642        rshrn           v10.4h,  v10.4s,  #6
2643        rshrn2          v10.8h,  v11.4s,  #6
2644        rshrn           v11.4h,  v12.4s,  #6
2645        rshrn2          v11.8h,  v13.4s,  #6
2646
2647        st1             {v10.8h}, [x0], x1
2648        subs            w5,  w5,  #2
2649        st1             {v11.8h}, [x13], x1
2650        b.le            4f
2651        mov             v18.16b, v20.16b
2652        b               2b
2653
265431:     // base_y >= 32, using separate loads, loading v18 if we had to bail
2655        // in the prologue.
2656        smov            w10,     v29.b[0]
2657        smov            w15,     v29.b[2]
2658        movi            v21.16b, #2
2659        smov            w16,     v29.b[4]
2660        add             x10, x3,  w10, sxtw
2661        smov            w17,     v29.b[6]
2662        add             x15, x3,  w15, sxtw
2663        ld1             {v18.h}[0], [x10]
2664        smov            w10,     v29.b[8]
2665        add             x16, x3,  w16, sxtw
2666        ld1             {v18.h}[1], [x15]
2667        smov            w15,     v29.b[10]
2668        add             x17, x3,  w17, sxtw
2669        ld1             {v18.h}[2], [x16]
2670        smov            w16,     v29.b[12]
2671        add             x10, x3,  w10, sxtw
2672        ld1             {v18.h}[3], [x17]
2673        smov            w17,     v29.b[14]
2674        add             x15, x3,  w15, sxtw
2675        add             x16, x3,  w16, sxtw
2676        ld1             {v18.h}[4], [x10]
2677        add             x17, x3,  w17, sxtw
2678        ld1             {v18.h}[5], [x15]
2679        add             v29.16b, v29.16b, v21.16b // next base_y
2680        ld1             {v18.h}[6], [x16]
2681        ld1             {v18.h}[7], [x17]
2682
268332:     // base_y >= 32, using separate loads.
2684        cmp             w5,  #4
2685        b.lt            34f
268633:     // h >= 4, preserving v18 from the previous round, loading v19-v22.
2687        smov            w10,     v29.b[0]
2688        subs            w5,  w5,  #4
2689        smov            w15,     v29.b[2]
2690        movi            v10.16b, #8
2691        smov            w16,     v29.b[4]
2692        add             x10, x3,  w10, sxtw
2693        smov            w17,     v29.b[6]
2694        add             x15, x3,  w15, sxtw
2695        ld4             {v19.h, v20.h, v21.h, v22.h}[0], [x10]
2696        smov            w10,     v29.b[8]
2697        add             x16, x3,  w16, sxtw
2698        ld4             {v19.h, v20.h, v21.h, v22.h}[1], [x15]
2699        smov            w15,     v29.b[10]
2700        add             x17, x3,  w17, sxtw
2701        ld4             {v19.h, v20.h, v21.h, v22.h}[2], [x16]
2702        smov            w16,     v29.b[12]
2703        add             x10, x3,  w10, sxtw
2704        ld4             {v19.h, v20.h, v21.h, v22.h}[3], [x17]
2705        smov            w17,     v29.b[14]
2706        add             x15, x3,  w15, sxtw
2707        add             x16, x3,  w16, sxtw
2708        ld4             {v19.h, v20.h, v21.h, v22.h}[4], [x10]
2709        add             x17, x3,  w17, sxtw
2710        ld4             {v19.h, v20.h, v21.h, v22.h}[5], [x15]
2711        ld4             {v19.h, v20.h, v21.h, v22.h}[6], [x16]
2712        add             v29.16b, v29.16b, v10.16b // next base_y
2713        ld4             {v19.h, v20.h, v21.h, v22.h}[7], [x17]
2714
2715        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2716        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2717        umull2          v11.4s,  v18.8h,  v28.8h
2718        umlal2          v11.4s,  v19.8h,  v27.8h
2719        umull           v12.4s,  v19.4h,  v28.4h
2720        umlal           v12.4s,  v20.4h,  v27.4h
2721        umull2          v13.4s,  v19.8h,  v28.8h
2722        umlal2          v13.4s,  v20.8h,  v27.8h
2723
2724        rshrn           v10.4h,  v10.4s,  #6
2725        rshrn2          v10.8h,  v11.4s,  #6
2726        rshrn           v11.4h,  v12.4s,  #6
2727        rshrn2          v11.8h,  v13.4s,  #6
2728
2729        umull           v12.4s,  v20.4h,  v28.4h  // left[base_y]*(64-frac_y)
2730        umlal           v12.4s,  v21.4h,  v27.4h  // + left[base_y+1]*frac_y
2731        umull2          v13.4s,  v20.8h,  v28.8h
2732        umlal2          v13.4s,  v21.8h,  v27.8h
2733        umull           v14.4s,  v21.4h,  v28.4h
2734        umlal           v14.4s,  v22.4h,  v27.4h
2735        umull2          v18.4s,  v21.8h,  v28.8h
2736        umlal2          v18.4s,  v22.8h,  v27.8h
2737
2738        rshrn           v12.4h,  v12.4s,  #6
2739        rshrn2          v12.8h,  v13.4s,  #6
2740        rshrn           v13.4h,  v14.4s,  #6
2741        rshrn2          v13.8h,  v18.4s,  #6
2742
2743        st1             {v10.8h}, [x0],  x1
2744        cmp             w5,  #2
2745        st1             {v11.8h}, [x13], x1
2746        st1             {v12.8h}, [x0],  x1
2747        st1             {v13.8h}, [x13], x1
2748        b.lt            4f
2749        mov             v18.16b, v22.16b
2750        b.gt            33b
2751
275234:     // h == 2, preserving v18 from the previous round, loading v19-v20.
2753        smov            w10,     v29.b[0]
2754        smov            w15,     v29.b[2]
2755        movi            v21.16b, #4
2756        smov            w16,     v29.b[4]
2757        add             x10, x3,  w10, sxtw
2758        smov            w17,     v29.b[6]
2759        add             x15, x3,  w15, sxtw
2760        ld2             {v19.h, v20.h}[0], [x10]
2761        smov            w10,     v29.b[8]
2762        add             x16, x3,  w16, sxtw
2763        ld2             {v19.h, v20.h}[1], [x15]
2764        smov            w15,     v29.b[10]
2765        add             x17, x3,  w17, sxtw
2766        ld2             {v19.h, v20.h}[2], [x16]
2767        smov            w16,     v29.b[12]
2768        add             x10, x3,  w10, sxtw
2769        ld2             {v19.h, v20.h}[3], [x17]
2770        smov            w17,     v29.b[14]
2771        add             x15, x3,  w15, sxtw
2772        add             x16, x3,  w16, sxtw
2773        ld2             {v19.h, v20.h}[4], [x10]
2774        add             x17, x3,  w17, sxtw
2775        ld2             {v19.h, v20.h}[5], [x15]
2776        ld2             {v19.h, v20.h}[6], [x16]
2777        add             v29.16b, v29.16b, v21.16b // next base_y
2778        ld2             {v19.h, v20.h}[7], [x17]
2779
2780        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2781        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2782        umull2          v11.4s,  v18.8h,  v28.8h
2783        umlal2          v11.4s,  v19.8h,  v27.8h
2784        umull           v12.4s,  v19.4h,  v28.4h
2785        umlal           v12.4s,  v20.4h,  v27.4h
2786        umull2          v13.4s,  v19.8h,  v28.8h
2787        umlal2          v13.4s,  v20.8h,  v27.8h
2788
2789        rshrn           v10.4h,  v10.4s,  #6
2790        rshrn2          v10.8h,  v11.4s,  #6
2791        rshrn           v11.4h,  v12.4s,  #6
2792        rshrn2          v11.8h,  v13.4s,  #6
2793
2794        st1             {v10.8h}, [x0], x1
2795        st1             {v11.8h}, [x13], x1
2796        // The h==2 case only happens once at the end, if at all.
2797
27984:
2799        subs            w4,  w4,  #8
2800        b.le            9f
2801
2802        lsr             x1,  x1,  #1
2803        msub            x0,  x1,  x12, x0         // ptr -= h * stride
2804        msub            x13, x1,  x12, x13
2805        lsl             x1,  x1,  #1
2806        add             x0,  x0,  #16
2807        add             x13, x13, #16
2808        mov             w5,  w12                  // reset h
2809        b               1b
2810
28119:
2812        ldp             d14, d15, [sp, #0x30]
2813        ldp             d12, d13, [sp, #0x20]
2814        ldp             d10, d11, [sp, #0x10]
2815        ldp             d8,  d9,  [sp], 0x40
2816        ret
2817
2818L(ipred_z2_fill1_tbl):
2819        .hword L(ipred_z2_fill1_tbl) - 640b
2820        .hword L(ipred_z2_fill1_tbl) - 320b
2821        .hword L(ipred_z2_fill1_tbl) - 160b
2822        .hword L(ipred_z2_fill1_tbl) -  80b
2823        .hword L(ipred_z2_fill1_tbl) -  40b
2824endfunc
2825
2826function ipred_z2_fill2_16bpc_neon, export=1
2827        cmp             w4,  #8
2828        mov             w8,  #(2 << 6)            // xpos = 2 << 6
2829        sub             w8,  w8,  w6              // xpos -= dx
2830
2831        movrel          x11, increments
2832        ld1             {v31.8h},  [x11]          // increments
2833        neg             w7,  w7                   // -dy
2834        b.eq            80f
2835
283640:
2837        dup             v30.4h,  w7               // -dy
2838        movi            v17.8b,  #1
2839
2840        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2841        movi            v25.8h,  #0x3e
2842        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2843
2844        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2845        // from left.
2846        ld1             {v0.8h, v1.8h}, [x3]      // left[]
2847
2848        movi            v26.8h,  #64
2849        movi            v19.16b, #4
2850
2851        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2852        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2853
2854        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2855
2856        movi            v23.4h,  #1, lsl #8
2857        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2858        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2859        movi            v17.8b,  #2
2860        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2861
2862        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2863        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2864
2865        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2866
2867        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2868
2869        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2870
2871        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2872
2873        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2874        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2875
2876        movi            v29.16b, #4
2877        add             v31.8h,  v31.8h,  v31.8h  // {0,2,4,6,0,2,4,6}
28784:
2879        asr             w9,  w8,  #6              // base_x
2880        dup             v16.4h,  w8               // xpos
2881        sub             w8,  w8,  w6              // xpos -= dx
2882        cmp             w9,  #-8                  // base_x <= -8
2883        asr             w11, w8,  #6              // base_x
2884        b.le            49f
2885
2886        lsl             w9,  w9,  #1
2887        lsl             w11, w11, #1
2888
2889        dup             v17.4h,  w8               // xpos
2890
2891        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2892        ldr             q6,  [x2, w11, sxtw]
2893
2894        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2895
2896        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2897
2898        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2899
2900        uzp2            v5.8h,   v4.8h,   v6.8h   // top[base_x+1]
2901        uzp1            v4.8h,   v4.8h,   v6.8h   // top[base_x]
2902
2903        and             v16.16b, v16.16b, v25.16b // frac_x
2904
2905        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2906
2907        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2908
2909        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2910
2911        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2912        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2913        umull2          v22.4s,  v18.8h,  v28.8h
2914        umlal2          v22.4s,  v19.8h,  v27.8h
2915
2916        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
2917        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2918        umull2          v24.4s,  v4.8h,   v17.8h
2919        umlal2          v24.4s,  v5.8h,   v16.8h
2920
2921        cmge            v20.8h,  v20.8h,  #0
2922
2923        rshrn           v21.4h,  v21.4s,  #6
2924        rshrn2          v21.8h,  v22.4s,  #6
2925        rshrn           v22.4h,  v23.4s,  #6
2926        rshrn2          v22.8h,  v24.4s,  #6
2927
2928        bit             v21.16b, v22.16b, v20.16b
2929
2930        st1             {v21.d}[0], [x0], x1
2931        sub             w8,  w8,  w6              // xpos -= dx
2932        subs            w5,  w5,  #2
2933        st1             {v21.d}[1], [x0], x1
2934        b.le            9f
2935
2936        ext             v18.16b, v19.16b, v19.16b, #8
2937        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2938        b               4b
2939
294049:
2941        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2942
2943        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2944
2945        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2946        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2947        umull2          v21.4s,  v18.8h,  v28.8h
2948        umlal2          v21.4s,  v19.8h,  v27.8h
2949
2950        rshrn           v20.4h,  v20.4s,  #6
2951        rshrn2          v20.8h,  v21.4s,  #6
2952
2953        st1             {v20.d}[0], [x0], x1
2954        subs            w5,  w5,  #2
2955        st1             {v20.d}[1], [x0], x1
2956        b.le            9f
2957
2958        ext             v18.16b, v19.16b, v19.16b, #8
2959        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2960        b               49b
2961
29629:
2963        ret
2964
296580:
2966        stp             d8,  d9,  [sp, #-0x40]!
2967        stp             d10, d11, [sp, #0x10]
2968        stp             d12, d13, [sp, #0x20]
2969        stp             d14, d15, [sp, #0x30]
2970
2971        dup             v18.8h,  w7               // -dy
2972        movi            v17.8b,  #1
2973
2974        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2975        movi            v25.8h,  #0x3e
2976        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2977
2978        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2979        // from left.
2980        ld1             {v0.8h, v1.8h}, [x3]      // left[]
2981
2982        movi            v26.8h,  #64
2983        movi            v19.16b, #4
2984
2985        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2986        and             v27.16b, v16.16b, v25.16b // frac_y
2987
2988        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2989
2990        movi            v23.8h,  #1, lsl #8
2991        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2992        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2993        movi            v17.16b, #2
2994        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
2995
2996        // Cut corners here; for the first row we don't expect to need to
2997        // read outside of v0.
2998        tbl             v18.16b, {v0.16b}, v29.16b // left[base_y]
2999
3000        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
3001        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
3002
3003        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
3004
3005        movi            v24.16b, #4
3006        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
30078:
3008        asr             w9,  w8,  #6              // base_x
3009        dup             v16.8h,   w8              // xpos
3010        sub             w8,  w8,  w6              // xpos -= dx
3011        cmp             w9,  #-16                 // base_x <= -16
3012        asr             w11, w8,  #6              // base_x
3013        b.le            89f
3014
3015        dup             v17.8h,   w8              // xpos
3016
3017        add             x9,  x2,  w9,  sxtw #1
3018        add             x11, x2,  w11, sxtw #1
3019
3020        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
3021        ld1             {v6.8h, v7.8h}, [x11]
3022
3023        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3024
3025        sshr            v21.8h,  v16.8h,  #6      // first base_x
3026        sshr            v22.8h,  v17.8h,  #6
3027
3028        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3029
3030        uzp2            v2.8h,   v4.8h,   v5.8h   // top[base_x+1]
3031        uzp1            v4.8h,   v4.8h,   v5.8h   // top[base_x]
3032        uzp2            v3.8h,   v6.8h,   v7.8h
3033        uzp1            v6.8h,   v6.8h,   v7.8h
3034        mov             v5.16b,  v2.16b
3035        mov             v7.16b,  v3.16b
3036
3037        and             v16.16b, v16.16b, v25.16b // frac_x
3038        and             v17.16b, v17.16b, v25.16b
3039
3040        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3041        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3042
3043        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
3044        sub             v9.8h,   v26.8h,  v17.8h
3045
3046        umull2          v11.4s,  v18.8h,  v28.8h
3047        umlal2          v11.4s,  v19.8h,  v27.8h
3048
3049        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
3050        add             v22.8h,  v22.8h,  v31.8h
3051
3052        umull           v12.4s,  v19.4h,  v28.4h
3053        umlal           v12.4s,  v20.4h,  v27.4h
3054        umull2          v13.4s,  v19.8h,  v28.8h
3055        umlal2          v13.4s,  v20.8h,  v27.8h
3056
3057        rshrn           v10.4h,  v10.4s,  #6
3058        rshrn2          v10.8h,  v11.4s,  #6
3059        rshrn           v11.4h,  v12.4s,  #6
3060        rshrn2          v11.8h,  v13.4s,  #6
3061
3062        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
3063        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
3064        umull2          v13.4s,  v4.8h,   v8.8h
3065        umlal2          v13.4s,  v5.8h,   v16.8h
3066        umull           v14.4s,  v6.4h,   v9.4h
3067        umlal           v14.4s,  v7.4h,   v17.4h
3068        umull2          v18.4s,  v6.8h,   v9.8h
3069        umlal2          v18.4s,  v7.8h,   v17.8h
3070
3071        cmge            v21.8h,  v21.8h,  #0
3072        cmge            v22.8h,  v22.8h,  #0
3073
3074        rshrn           v12.4h,  v12.4s,  #6
3075        rshrn2          v12.8h,  v13.4s,  #6
3076        rshrn           v13.4h,  v14.4s,  #6
3077        rshrn2          v13.8h,  v18.4s,  #6
3078
3079        bit             v10.16b, v12.16b, v21.16b
3080        bit             v11.16b, v13.16b, v22.16b
3081
3082        st1             {v10.8h}, [x0], x1
3083        subs            w5,  w5,  #2
3084        sub             w8,  w8,  w6              // xpos -= dx
3085        st1             {v11.8h}, [x0], x1
3086        b.le            9f
3087
3088        mov             v18.16b, v20.16b
3089        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3090        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3091        b               8b
3092
309389:
3094        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3095        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3096
3097        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3098        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3099        umull2          v5.4s,   v18.8h,  v28.8h
3100        umlal2          v5.4s,   v19.8h,  v27.8h
3101        umull           v6.4s,   v19.4h,  v28.4h
3102        umlal           v6.4s,   v20.4h,  v27.4h
3103        umull2          v7.4s,   v19.8h,  v28.8h
3104        umlal2          v7.4s,   v20.8h,  v27.8h
3105
3106        rshrn           v4.4h,   v4.4s,   #6
3107        rshrn2          v4.8h,   v5.4s,   #6
3108        rshrn           v5.4h,   v6.4s,   #6
3109        rshrn2          v5.8h,   v7.4s,   #6
3110
3111        st1             {v4.8h}, [x0], x1
3112        subs            w5,  w5,  #2
3113        st1             {v5.8h}, [x0], x1
3114        b.le            9f
3115
3116        mov             v18.16b, v20.16b
3117        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3118        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3119        b               89b
3120
31219:
3122        ldp             d14, d15, [sp, #0x30]
3123        ldp             d12, d13, [sp, #0x20]
3124        ldp             d10, d11, [sp, #0x10]
3125        ldp             d8,  d9,  [sp], 0x40
3126        ret
3127endfunc
3128
// Z2 (diagonal, up-left) prediction, variant with upsample_left, 16 bpc.
// Presumed C prototype (matches the sibling z2 fill functions — confirm):
//   void ipred_z2_fill3_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                  const pixel *const top,
//                                  const pixel *const left,
//                                  const int width, const int height,
//                                  const int dx, const int dy);
// In:  x0 = dst, x1 = stride, x2 = top[], x3 = left[],
//      w4 = width (4 or 8), w5 = height, w6 = dx, w7 = dy
// Because left[] is upsampled, each output row advances base_y by 2
// elements (all left[] byte indices below are therefore "(*2)").
// Columns right of the diagonal take the top[] interpolation, columns
// left of it take the left[] interpolation, selected per lane via cmge+bit.
function ipred_z2_fill3_16bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(1 << 6)            // xpos = 1 << 6
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments
        neg             w7,  w7                   // -dy
        b.eq            80f

        // Width == 4 path. Processes 2 rows per iteration; each 128-bit
        // vector holds two rows of 4 pixels (row 0 in the low half).
40:
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1               // dead value: v17 is rewritten with #2 below

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.8h,  #0x3e            // 0x3e = mask for the 6 fractional bits
        add             v30.4h,  v16.4h,  v30.4h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #2

        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v30.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        movi            v23.4h,  #1, lsl #8       // {0x100,..}: +1 on every odd byte index
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        movi            v19.16b, #4
        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
        movi            v17.8b,  #2
        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ... (tbl byte indices)

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}

        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3 (*2)

        trn1            v29.2d,  v29.2d,  v28.2d  // base_y + 0, base_y + 2 (rows 0 and 1)
        trn1            v30.2d,  v30.2d,  v24.2d  // base_y + 1, base_y + 3

        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y

        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y

        movi            v24.16b, #8
4:
        asr             w9,  w8,  #6              // base_x
        dup             v16.4h,  w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-4                  // base_x <= -4: whole 2-row pair from left[]
        asr             w11, w8,  #6              // base_x
        b.le            49f

        lsl             w9,  w9,  #1
        lsl             w11, w11, #1

        dup             v17.4h,  w8               // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
        ldr             q6,  [x2, w11, sxtw]

        trn1            v16.2d,  v16.2d,  v17.2d  // xpos

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row

        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v6.16b,  #2

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]

        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x

        add             v20.8h,  v20.8h,  v31.8h  // actual base_x

        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v22.4s,  v18.8h,  v28.8h
        umlal2          v22.4s,  v19.8h,  v27.8h

        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]*(64-frac_x)
        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v24.4s,  v4.8h,   v17.8h
        umlal2          v24.4s,  v5.8h,   v16.8h

        cmge            v20.8h,  v20.8h,  #0      // base_x >= 0: lane uses top[], else left[]

        rshrn           v21.4h,  v21.4s,  #6
        rshrn2          v21.8h,  v22.4s,  #6
        rshrn           v22.4h,  v23.4s,  #6
        rshrn2          v22.8h,  v24.4s,  #6

        movi            v24.16b, #8               // reload increment (v24 clobbered above)

        bit             v21.16b, v22.16b, v20.16b // select top result where base_x >= 0

        st1             {v21.d}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               4b

        // Tail: base_x is fully negative, output comes from left[] only.
49:
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v21.4s,  v18.8h,  v28.8h
        umlal2          v21.4s,  v19.8h,  v27.8h

        rshrn           v20.4h,  v20.4s,  #6
        rshrn2          v20.8h,  v21.4s,  #6

        st1             {v20.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               49b

9:
        ret

        // Width == 8 path. Needs v8-v15 as scratch, so save the
        // callee-saved low halves (AAPCS64: d8-d15) first.
80:
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        dup             v18.8h,  w7               // -dy
        movi            v17.16b, #2

        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.8h,  #0x3e            // mask for the 6 fractional bits
        add             v16.8h,  v16.8h,  v18.8h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #4               // dead value: v19 is rewritten by tbl before use

        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
        and             v27.16b, v16.16b, v25.16b // frac_y

        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 2

        movi            v23.8h,  #1, lsl #8       // +1 on every odd byte index
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... (tbl byte indices)

        add             v30.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y

        movi            v24.16b, #4
8:
        asr             w9,  w8,  #6              // base_x
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-16                 // base_x <= -16: whole 2-row pair from left[]
        asr             w11, w8,  #6              // base_x
        b.le            89f

        dup             v17.8h,   w8              // xpos

        add             x9,  x2,  w9,  sxtw #1
        add             x11, x2,  w11, sxtw #1

        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
        ld1             {v6.8h, v7.8h}, [x11]

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b

        sshr            v22.8h,  v16.8h,  #6      // first base_x
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        sshr            v23.8h,  v17.8h,  #6
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v7.16b,  #2

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y

        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
        sub             v9.8h,   v26.8h,  v17.8h

        umull2          v11.4s,  v18.8h,  v28.8h
        umlal2          v11.4s,  v19.8h,  v27.8h

        add             v22.8h,  v22.8h,  v31.8h  // actual base_x
        add             v23.8h,  v23.8h,  v31.8h

        umull           v12.4s,  v20.4h,  v28.4h  // second row: left[base_y+2]/[base_y+3]
        umlal           v12.4s,  v21.4h,  v27.4h
        umull2          v13.4s,  v20.8h,  v28.8h
        umlal2          v13.4s,  v21.8h,  v27.8h

        rshrn           v10.4h,  v10.4s,  #6
        rshrn2          v10.8h,  v11.4s,  #6
        rshrn           v11.4h,  v12.4s,  #6
        rshrn2          v11.8h,  v13.4s,  #6

        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v13.4s,  v4.8h,   v8.8h
        umlal2          v13.4s,  v5.8h,   v16.8h
        umull           v14.4s,  v6.4h,   v9.4h
        umlal           v14.4s,  v7.4h,   v17.4h
        umull2          v18.4s,  v6.8h,   v9.8h
        umlal2          v18.4s,  v7.8h,   v17.8h

        cmge            v22.8h,  v22.8h,  #0      // base_x >= 0: lane uses top[], else left[]
        cmge            v23.8h,  v23.8h,  #0

        rshrn           v12.4h,  v12.4s,  #6
        rshrn2          v12.8h,  v13.4s,  #6
        rshrn           v13.4h,  v14.4s,  #6
        rshrn2          v13.8h,  v18.4s,  #6

        bit             v10.16b, v12.16b, v22.16b // select top result where base_x >= 0
        bit             v11.16b, v13.16b, v23.16b

        st1             {v10.8h}, [x0], x1
        subs            w5,  w5,  #2
        sub             w8,  w8,  w6              // xpos -= dx
        st1             {v11.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               8b

        // Tail: base_x is fully negative, output comes from left[] only.
89:
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v5.4s,   v18.8h,  v28.8h
        umlal2          v5.4s,   v19.8h,  v27.8h
        umull           v6.4s,   v20.4h,  v28.4h
        umlal           v6.4s,   v21.4h,  v27.4h
        umull2          v7.4s,   v20.8h,  v28.8h
        umlal2          v7.4s,   v21.8h,  v27.8h

        rshrn           v4.4h,   v4.4s,   #6
        rshrn2          v4.8h,   v5.4s,   #6
        rshrn           v5.4h,   v6.4s,   #6
        rshrn2          v5.8h,   v7.4s,   #6

        st1             {v4.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               89b

9:
        // Restore callee-saved d8-d15 (width == 8 path only).
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc
3428
3429// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3430//                                const pixel *const left,
3431//                                const int width, const int height,
3432//                                const int dy, const int max_base_y);
3433function ipred_z3_fill1_16bpc_neon, export=1
3434        clz             w9,  w4
3435        adr             x8,  L(ipred_z3_fill1_tbl)
3436        sub             w9,  w9,  #25
3437        ldrh            w9,  [x8, w9, uxtw #1]
3438        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
3439        sub             x8,  x8,  w9,  uxtw
3440        ld1r            {v31.8h}, [x10]           // padding
3441        mov             w7,  w5
3442        mov             w15, #64
3443        add             x13, x0,  x1
3444        lsl             x1,  x1,  #1
3445        br              x8
3446
344740:
3448        AARCH64_VALID_JUMP_TARGET
34494:
3450        lsr             w8,  w7,  #6              // base
3451        and             w9,  w7,  #0x3e           // frac
3452        add             w7,  w7,  w5              // xpos += dx
3453        cmp             w8,  w6                   // base >= max_base_x
3454        lsr             w10, w7,  #6              // base
3455        and             w11, w7,  #0x3e           // frac
3456        b.ge            ipred_z3_fill_padding_neon
3457        lsl             w8,  w8,  #1
3458        lsl             w10, w10, #1
3459        ldr             q0,  [x2, w8, uxtw]       // left[base]
3460        ldr             q2,  [x2, w10, uxtw]
3461        dup             v4.8h,   w9               // frac
3462        dup             v5.8h,   w11
3463        ext             v1.16b,  v0.16b,  v0.16b,  #2 // left[base+1]
3464        ext             v3.16b,  v2.16b,  v2.16b,  #2
3465        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
3466        sub             v7.4h,   v3.4h,   v2.4h
3467        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
3468        ushll           v17.4s,  v2.4h,   #6
3469        smlal           v16.4s,  v6.4h,   v4.4h   // + top[base+1]*frac
3470        smlal           v17.4s,  v7.4h,   v5.4h
3471        rshrn           v16.4h,  v16.4s,  #6
3472        rshrn           v17.4h,  v17.4s,  #6
3473        subs            w3,  w3,  #2
3474        zip1            v18.8h,  v16.8h,  v17.8h
3475        st1             {v18.s}[0], [x0],  x1
3476        st1             {v18.s}[1], [x13], x1
3477        add             w7,  w7,  w5              // xpos += dx
3478        st1             {v18.s}[2], [x0]
3479        st1             {v18.s}[3], [x13]
3480        b.le            9f
3481        sub             x0,  x0,  x1              // ptr -= 4 * (2*stride)
3482        sub             x13, x13, x1
3483        add             x0,  x0,  #4
3484        add             x13, x13, #4
3485        b               4b
34869:
3487        ret
3488
348980:
3490        AARCH64_VALID_JUMP_TARGET
34918:
3492        lsr             w8,  w7,  #6              // base
3493        and             w9,  w7,  #0x3e           // frac
3494        add             w7,  w7,  w5              // xpos += dx
3495        cmp             w8,  w6                   // base >= max_base_x
3496        lsr             w10, w7,  #6              // base
3497        and             w11, w7,  #0x3e           // frac
3498        b.ge            ipred_z3_fill_padding_neon
3499        add             x8,  x2,  w8,  uxtw #1
3500        add             x10, x2,  w10, uxtw #1
3501        dup             v4.8h,   w9               // frac
3502        dup             v5.8h,   w11
3503        ld1             {v0.8h},  [x8]            // left[base]
3504        ld1             {v2.8h},  [x10]
3505        sub             w9,  w15, w9              // 64 - frac
3506        sub             w11, w15, w11
3507        ldr             h1, [x8, #16]
3508        ldr             h3, [x10, #16]
3509        dup             v6.8h,   w9               // 64 - frac
3510        dup             v7.8h,   w11
3511        ext             v1.16b,  v0.16b,  v1.16b,  #2 // left[base+1]
3512        ext             v3.16b,  v2.16b,  v3.16b,  #2
3513        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
3514        umlal           v16.4s,  v1.4h,   v4.4h   // + left[base+1]*frac
3515        umull2          v17.4s,  v0.8h,   v6.8h
3516        umlal2          v17.4s,  v1.8h,   v4.8h
3517        umull           v18.4s,  v2.4h,   v7.4h
3518        umlal           v18.4s,  v3.4h,   v5.4h
3519        umull2          v19.4s,  v2.8h,   v7.8h
3520        umlal2          v19.4s,  v3.8h,   v5.8h
3521        rshrn           v16.4h,  v16.4s,  #6
3522        rshrn2          v16.8h,  v17.4s,  #6
3523        rshrn           v17.4h,  v18.4s,  #6
3524        rshrn2          v17.8h,  v19.4s,  #6
3525        subs            w3,  w3,  #2
3526        zip1            v18.8h,  v16.8h,  v17.8h
3527        zip2            v19.8h,  v16.8h,  v17.8h
3528        add             w7,  w7,  w5              // xpos += dx
3529        st1             {v18.s}[0], [x0],  x1
3530        st1             {v18.s}[1], [x13], x1
3531        st1             {v18.s}[2], [x0],  x1
3532        st1             {v18.s}[3], [x13], x1
3533        st1             {v19.s}[0], [x0],  x1
3534        st1             {v19.s}[1], [x13], x1
3535        st1             {v19.s}[2], [x0],  x1
3536        st1             {v19.s}[3], [x13], x1
3537        b.le            9f
3538        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
3539        sub             x13, x13, x1, lsl #2
3540        add             x0,  x0,  #4
3541        add             x13, x13, #4
3542        b               8b
35439:
3544        ret
3545
3546160:
3547320:
3548640:
3549        AARCH64_VALID_JUMP_TARGET
3550        mov             w12, w4
35511:
3552        lsr             w8,  w7,  #6              // base
3553        and             w9,  w7,  #0x3e           // frac
3554        add             w7,  w7,  w5              // ypos += dy
3555        cmp             w8,  w6                   // base >= max_base_y
3556        lsr             w10, w7,  #6              // base
3557        and             w11, w7,  #0x3e           // frac
3558        b.ge            ipred_z3_fill_padding_neon
3559        add             x8,  x2,  w8,  uxtw #1
3560        add             x10, x2,  w10, uxtw #1
3561        dup             v6.8h,   w9               // frac
3562        dup             v7.8h,   w11
3563        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // left[base]
3564        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
3565        sub             w9,  w15, w9              // 64 - frac
3566        sub             w11, w15, w11
3567        dup             v16.8h,  w9               // 64 - frac
3568        dup             v17.8h,  w11
3569        add             w7,  w7,  w5              // ypos += dy
35702:
3571        ext             v18.16b, v0.16b,  v1.16b,  #2 // left[base+1]
3572        ext             v19.16b, v1.16b,  v2.16b,  #2
3573        ext             v20.16b, v3.16b,  v4.16b,  #2
3574        ext             v21.16b, v4.16b,  v5.16b,  #2
3575        subs            w4,  w4,  #16
3576        umull           v22.4s,  v0.4h,   v16.4h  // left[base]*(64-frac)
3577        umlal           v22.4s,  v18.4h,  v6.4h   // + left[base+1]*frac
3578        umull2          v23.4s,  v0.8h,   v16.8h
3579        umlal2          v23.4s,  v18.8h,  v6.8h
3580        umull           v24.4s,  v1.4h,   v16.4h
3581        umlal           v24.4s,  v19.4h,  v6.4h
3582        umull2          v25.4s,  v1.8h,   v16.8h
3583        umlal2          v25.4s,  v19.8h,  v6.8h
3584        umull           v26.4s,  v3.4h,   v17.4h
3585        umlal           v26.4s,  v20.4h,  v7.4h
3586        umull2          v27.4s,  v3.8h,   v17.8h
3587        umlal2          v27.4s,  v20.8h,  v7.8h
3588        umull           v28.4s,  v4.4h,   v17.4h
3589        umlal           v28.4s,  v21.4h,  v7.4h
3590        umull2          v29.4s,  v4.8h,   v17.8h
3591        umlal2          v29.4s,  v21.8h,  v7.8h
3592        rshrn           v22.4h,  v22.4s,  #6
3593        rshrn2          v22.8h,  v23.4s,  #6
3594        rshrn           v23.4h,  v24.4s,  #6
3595        rshrn2          v23.8h,  v25.4s,  #6
3596        rshrn           v24.4h,  v26.4s,  #6
3597        rshrn2          v24.8h,  v27.4s,  #6
3598        rshrn           v25.4h,  v28.4s,  #6
3599        rshrn2          v25.8h,  v29.4s,  #6
3600        zip1            v18.8h,  v22.8h,  v24.8h
3601        zip2            v19.8h,  v22.8h,  v24.8h
3602        zip1            v20.8h,  v23.8h,  v25.8h
3603        zip2            v21.8h,  v23.8h,  v25.8h
3604        st1             {v18.s}[0], [x0],  x1
3605        st1             {v18.s}[1], [x13], x1
3606        st1             {v18.s}[2], [x0],  x1
3607        st1             {v18.s}[3], [x13], x1
3608        st1             {v19.s}[0], [x0],  x1
3609        st1             {v19.s}[1], [x13], x1
3610        st1             {v19.s}[2], [x0],  x1
3611        st1             {v19.s}[3], [x13], x1
3612        st1             {v20.s}[0], [x0],  x1
3613        st1             {v20.s}[1], [x13], x1
3614        st1             {v20.s}[2], [x0],  x1
3615        st1             {v20.s}[3], [x13], x1
3616        st1             {v21.s}[0], [x0],  x1
3617        st1             {v21.s}[1], [x13], x1
3618        st1             {v21.s}[2], [x0],  x1
3619        st1             {v21.s}[3], [x13], x1
3620        b.le            3f
3621        mov             v0.16b,  v2.16b
3622        ld1             {v1.8h, v2.8h}, [x8],  #32      // left[base]
3623        mov             v3.16b,  v5.16b
3624        ld1             {v4.8h, v5.8h}, [x10], #32
3625        b               2b
3626
36273:
3628        subs            w3,  w3,  #2
3629        b.le            9f
3630        lsr             x1,  x1,  #1
3631        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3632        msub            x13, x1,  x12, x13
3633        lsl             x1,  x1,  #1
3634        add             x0,  x0,  #4
3635        add             x13, x13, #4
3636        mov             w4,  w12
3637        b               1b
36389:
3639        ret
3640
3641L(ipred_z3_fill1_tbl):
3642        .hword L(ipred_z3_fill1_tbl) - 640b
3643        .hword L(ipred_z3_fill1_tbl) - 320b
3644        .hword L(ipred_z3_fill1_tbl) - 160b
3645        .hword L(ipred_z3_fill1_tbl) -  80b
3646        .hword L(ipred_z3_fill1_tbl) -  40b
3647endfunc
3648
// Fill the out-of-range region of a z3 prediction block with the
// padding value broadcast in v31 (left[max_base_y]).
// In: x0/x13 = dst ptrs for even/odd row pairs, x1 = 2*stride,
//     w3 = width, w4 = height (w12 = saved height for column restarts).
// Widths <= 8 go through the power-of-two jump table below; wider
// blocks take the row-at-a-time L(ipred_z3_fill_padding_wide) path.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #8
        adr             x8,  L(ipred_z3_fill_padding_tbl)
        b.gt            L(ipred_z3_fill_padding_wide)
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3
        sub             w9,  w9,  #25
        ldrh            w9,  [x8, w9, uxtw #1]
        sub             x9,  x8,  w9,  uxtw
        br              x9

2:      // 2 pixels wide
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4
        add             x13, x13, #4
        mov             w4,  w12
        b               1b

4:      // 4 pixels wide
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.4h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.4h}, [x13], x1
        st1             {v31.4h}, [x0],  x1
        st1             {v31.4h}, [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

8:      // 8 (or more, in 8-pixel steps) pixels wide
16:
32:
64:
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.8h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8h}, [x13], x1
        st1             {v31.8h}, [x0],  x1
        st1             {v31.8h}, [x13], x1
        // Loop back to this 8-pixel store loop. (Branching to 4b here,
        // as before, re-entered the 4-pixel loop above and left the
        // upper half of every remaining row unfilled.)
        b.gt            8b
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret

L(ipred_z3_fill_padding_tbl):
        .hword L(ipred_z3_fill_padding_tbl) - 64b
        .hword L(ipred_z3_fill_padding_tbl) - 32b
        .hword L(ipred_z3_fill_padding_tbl) - 16b
        .hword L(ipred_z3_fill_padding_tbl) -  8b
        .hword L(ipred_z3_fill_padding_tbl) -  4b
        .hword L(ipred_z3_fill_padding_tbl) -  2b

L(ipred_z3_fill_padding_wide):
        // Fill a WxH rectangle with padding, with W > 8.
        lsr             x1,  x1,  #1
        mov             w12, w3
        sub             x1,  x1,  w3,  uxtw #1    // stride minus one row's width
1:
        ands            w5,  w3,  #7
        b.eq            2f
        // If the width isn't aligned to 8, first do one 8 pixel write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.8h}, [x0]
        add             x0,  x0,  w5,  uxtw #1
2:
        // Fill the rest of the line with aligned 8 pixel writes.
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1
        b.le            9f
        mov             w3,  w12
        b               1b
9:
        ret
endfunc
3763
// z3 edge fill for upsampled edges (h == 4 or h == 8): the left edge
// array at x2 holds interleaved even/odd upsampled samples, so
// left[base] / left[base+1] are deinterleaved with uzp1/uzp2.
// Two output columns are produced per iteration and interleaved with
// zip1/zip2 before being stored transposed, two pixels per row.
// In: x0 = dst, x1 = stride, x2 = left edge, w3 = width, w4 = height,
//     w5 = dy, w6 = max_base_y. Falls through to the padding fill
//     once base runs past max_base_y.
function ipred_z3_fill2_16bpc_neon, export=1
        cmp             w4,  #8
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // ypos
        mov             w15, #64
        add             x13, x0,  x1              // second dst ptr (odd rows)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        b.eq            8f

4:      // h == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // left[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8h,   v0.8h,   v0.8h   // left[base+1]
        uzp1            v0.8h,   v0.8h,   v0.8h   // left[base]
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // left[base+1]-left[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // left[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (left[base+1]-left[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        subs            w3,  w3,  #2
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[2], [x0]
        st1             {v18.s}[3], [x13]
        b.le            9f
        sub             x0,  x0,  x1              // x0 advanced by one 2*stride above; back to row 0
        sub             x13, x13, x1
        add             x0,  x0,  #4              // advance 2 pixels, to the next column pair
        add             x13, x13, #4
        b               4b
9:
        ret

8:      // h == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h},  [x8]     // left[base]
        ld1             {v2.8h, v3.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // left[base+1]
        uzp1            v0.8h,   v0.8h,   v1.8h   // left[base]
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + left[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        subs            w3,  w3,  #2
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        zip2            v19.8h,  v16.8h,  v17.8h
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        b.le            9f
        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4              // advance 2 pixels, to the next column pair
        add             x13, x13, #4
        b               8b
9:
        ret
endfunc
3871
3872
3873// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3874//                              const pixel *const topleft,
3875//                              const int width, const int height, const int filt_idx,
3876//                              const int max_width, const int max_height,
3877//                              const int bitdepth_max);
.macro filter_fn bpc
// Filter intra prediction. Each 4x2 output block is a 7-tap
// combination of the topleft pixel, 4 top pixels and 2 left pixels,
// using one of the signed 8-bit tap sets from filter_intra_taps
// (selected by filt_idx in w5). Already-predicted pixels feed the
// next 4x2 block, which is why the blocks are chained serially.
// For \bpc == 10 the tap*pixel products fit in 16 bits, so plain
// 8h mul/mla accumulation is used (with v30 = 0 as the lower clamp
// after the signed srshr); for 12 bpc, 32-bit smull/smlal with
// sqrshrun doing the clamp-to-zero. v31 = bitdepth_max upper clamp.
function ipred_filter_\bpc\()bpc_neon
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6
        add             x6,  x6,  w5, uxtw
        // v16-v22 = the 7 tap vectors, sign-extended to 16 bit
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        adr             x5,  L(ipred_filter\bpc\()_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26
        ldrh            w9,  [x5, w9, uxtw #1]
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        sub             x5,  x5,  w9, uxtw
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1              // second dst ptr (odd rows)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        dup             v31.8h,  w8               // bitdepth_max
.if \bpc == 10
        movi            v30.8h,  #0
.endif
        br              x5                        // dispatch on log2(width)
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #2]             // top (0-3)
        sub             x2,  x2,  #4
        mov             x7,  #-4
4:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
.endif
        smin            v2.8h,   v2.8h,   v31.8h
        subs            w4,  w4,  #2
        st1             {v2.d}[0], [x0], x1
        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
        st1             {v2.d}[1], [x6], x1
        b.gt            4b
        ret
80:     // w == 8; two chained 4x2 blocks per iteration, the second one
        // using outputs of the first (v2.h[3]/v2.h[7]) as its left pixels
        AARCH64_VALID_JUMP_TARGET
        ldur            q0,  [x2, #2]             // top (0-7)
        sub             x2,  x2,  #4
        mov             x7,  #-4
8:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
        smin            v2.8h,   v2.8h,   v31.8h
        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.4h,   v4.4s,   #4
        sqrshrun2       v3.8h,   v5.4s,   #4
.endif
        smin            v3.8h,   v3.8h,   v31.8h
        subs            w4,  w4,  #2
        st2             {v2.d, v3.d}[0], [x0], x1
        zip2            v0.2d,   v2.2d,   v3.2d   // bottom row becomes next top
        st2             {v2.d, v3.d}[1], [x6], x1
        b.gt            8b
        ret
160:    // w == 16 or 32; four chained 4x2 blocks per 16 pixels, each
320:    // feeding its rightmost outputs to the next block's left pixels
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #2
        sub             x2,  x2,  #4
        mov             x7,  #-4
        sub             x1,  x1,  w3, uxtw #1
        mov             w9,  w3

1:
        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
2:
        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v3.8h,   v3.8h,   v31.8h
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        srshr           v4.8h,   v4.8h,   #4
        smax            v4.8h,   v4.8h,   v30.8h
        smin            v4.8h,   v4.8h,   v31.8h
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        srshr           v5.8h,   v5.8h,   #4
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v5.8h,   v5.8h,   v31.8h
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        srshr           v6.8h,   v6.8h,   #4
        smax            v6.8h,   v6.8h,   v30.8h
.else
        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.4h,   v3.4s,   #4
        sqrshrun2       v3.8h,   v4.4s,   #4
        smin            v3.8h,   v3.8h,   v31.8h
        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.4h,   v5.4s,   #4
        sqrshrun2       v4.8h,   v6.4s,   #4
        smin            v4.8h,   v4.8h,   v31.8h
        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.4h,   v24.4s,  #4
        sqrshrun2       v5.8h,   v25.4s,  #4
        smin            v5.8h,   v5.8h,   v31.8h
        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.4h,   v26.4s,  #4
        sqrshrun2       v6.8h,   v27.4s,  #4
.endif
        smin            v6.8h,   v6.8h,   v31.8h

        // Carry the row's rightmost outputs into v0 as the left/topleft
        // inputs for the next two rows.
        ins             v0.h[2], v2.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins             v0.h[0], v6.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins             v0.h[1], v6.h[3]
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw #1     // rewind top ptr to the row just written
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_filter\bpc\()_tbl):
        .hword L(ipred_filter\bpc\()_tbl) - 320b
        .hword L(ipred_filter\bpc\()_tbl) - 160b
        .hword L(ipred_filter\bpc\()_tbl) -  80b
        .hword L(ipred_filter\bpc\()_tbl) -  40b
endfunc
.endm
4173
// Instantiate the filter function for both accumulator widths:
// the 10 bpc version accumulates in 16 bits (mul/mla .8h), the
// 12 bpc version in 32 bits (smull/smlal .4s).
filter_fn 10
filter_fn 12
4176
// Exported entry point: dispatch on the bitdepth_max argument (passed
// on the stack) — values <= 0x3ff select the 10 bpc (16-bit
// accumulator) implementation, larger values the 12 bpc one.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]                 // bitdepth_max
        cmp             w8,  0x3ff
        b.le            ipred_filter_10bpc_neon
        b               ipred_filter_12bpc_neon
endfunc
4183
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const pal, const uint8_t *idx,
//                          const int w, const int h);
// Palette prediction: expand packed palette indices into 16 bit pixels.
// Each idx byte packs two palette indices (one in bits 0-2, the other in
// the high nibble); they are turned into byte offsets into the 8-entry
// palette and fetched with a byte-wise tbl lookup.
function pal_pred_16bpc_neon, export=1
        ld1             {v30.8h}, [x2]          // v30 = palette, 8 x 16 bit entries
        clz             w9,  w4                 // w is a power of two in [4, 64]
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25           // table index: 0 (w=64) .. 4 (w=4)
        movi            v29.16b, #7             // mask for a 3 bit palette index
        ldrh            w9,  [x6, w9, uxtw #1]
        movi            v31.8h,  #1, lsl #8     // 0x0100: +1 on every odd byte lane
        sub             x6,  x6,  w9, uxtw
        br              x6
40:     // w = 4: 8 idx bytes = 16 pixels = 4 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1            // x2 writes the odd rows
        lsl             x1,  x1,  #1            // step two rows per store
4:
        ld1             {v1.8b}, [x3], #8
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4    // high-nibble indices
        and             v2.8b,   v1.8b,   v29.8b // low indices (bits 0-2)
        zip1            v1.16b,  v2.16b,  v3.16b // back into original pixel order
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        add             v1.16b,  v1.16b,  v1.16b
        zip1            v0.16b,  v1.16b,  v1.16b
        zip2            v1.16b,  v1.16b,  v1.16b
        add             v0.8h,   v0.8h,   v31.8h // per pixel: bytes (2*i, 2*i+1),
        add             v1.8h,   v1.8h,   v31.8h // the two bytes of palette entry i
        tbl             v0.16b, {v30.16b}, v0.16b // fetch 16 bit entries bytewise
        st1             {v0.d}[0], [x0], x1
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.d}[1], [x2], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x2], x1
        b.gt            4b
        ret
80:     // w = 8: 16 idx bytes = 32 pixels = 4 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
8:
        ld1             {v2.16b}, [x3], #16
        subs            w5,  w5,  #4
        ushr            v4.16b,  v2.16b,  #4    // high-nibble indices
        and             v3.16b,  v2.16b,  v29.16b // low indices
        zip1            v2.16b,  v3.16b,  v4.16b
        zip2            v3.16b,  v3.16b,  v4.16b
        // Index -> byte offset pairs (2*i, 2*i+1), as in the w=4 case.
        add             v2.16b,  v2.16b,  v2.16b
        add             v3.16b,  v3.16b,  v3.16b
        zip1            v0.16b,  v2.16b,  v2.16b
        zip2            v1.16b,  v2.16b,  v2.16b
        zip1            v2.16b,  v3.16b,  v3.16b
        zip2            v3.16b,  v3.16b,  v3.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.8h}, [x0], x1
        tbl             v2.16b, {v30.16b}, v2.16b
        st1             {v1.8h}, [x2], x1
        tbl             v3.16b, {v30.16b}, v3.16b
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x2], x1
        b.gt            8b
        ret
160:    // w = 16: 32 idx bytes = 64 pixels = 4 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
16:
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #4
        ushr            v7.16b,  v4.16b,  #4    // split both idx vectors into
        and             v6.16b,  v4.16b,  v29.16b // high/low index halves
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        // Index -> byte offset pairs (2*i, 2*i+1), as in the w=4 case.
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b // tbl lookups interleaved with
        add             v5.8h,   v5.8h,   v31.8h  // the remaining adds/stores to
        tbl             v1.16b, {v30.16b}, v1.16b // hide their latency
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        st1             {v2.8h, v3.8h}, [x2], x1
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h}, [x0], x1
        st1             {v6.8h, v7.8h}, [x2], x1
        b.gt            16b
        ret
320:    // w = 32: 32 idx bytes = 64 pixels = 2 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
32:
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #2
        ushr            v7.16b,  v4.16b,  #4    // same expansion as the w=16
        and             v6.16b,  v4.16b,  v29.16b // case, stored as 2 rows of 32
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            32b
        ret
640:    // w = 64: 32 idx bytes = 64 pixels = 1 row per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  #64           // x2 = second half of the row
64:
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #1
        ushr            v7.16b,  v4.16b,  #4    // same expansion as the w=16
        and             v6.16b,  v4.16b,  v29.16b // case, stored as 1 row of 64
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            64b
        ret

L(pal_pred_tbl): // backwards .hword offsets from the table to each width case
        .hword L(pal_pred_tbl) - 640b
        .hword L(pal_pred_tbl) - 320b
        .hword L(pal_pred_tbl) - 160b
        .hword L(pal_pred_tbl) -  80b
        .hword L(pal_pred_tbl) -  40b
endfunc
4402
// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
// CFL prediction with a fixed mid-gray DC. Also defines the shared
// L(ipred_cfl_splat_*) entry points, which write
// clamp(dc + ((ac * alpha + sign + 32) >> 6), 0, bitdepth_max)
// and are reused by the other ipred_cfl_* functions below.
function ipred_cfl_128_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26 // table index: 0 (w=32) .. 3 (w=4)
        ldrh            w9,  [x7, w9, uxtw #1]
        urshr           v0.8h,   v31.8h,  #1 // dc = (bitdepth_max + 1) >> 1
        dup             v1.8h,   w6   // alpha
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1  // x6 writes the second of each row pair
        lsl             x1,  x1,  #1  // step two rows at a time
        movi            v30.8h,  #0   // lower clamp bound
        br              x7
L(ipred_cfl_splat_w4): // 4 rows of width 4 per iteration (16 ac coefficients)
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #4
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.d}[0],  [x0], x1
        st1             {v2.d}[1],  [x6], x1
        st1             {v3.d}[0],  [x0], x1
        st1             {v3.d}[1],  [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8): // 2 rows of width 8 per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #2
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h},  [x0], x1
        st1             {v3.8h},  [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16): // w = 16 or 32: inner loop over the width, 2 rows at a time
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x5,  w3, uxtw #1 // x7 = ac for the second row
        sub             x1,  x1,  w3, uxtw #1 // stride minus the width written per row
        mov             w9,  w3               // save the width for the outer loop
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        subs            w3,  w3,  #16
        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
        smull2          v17.4s,  v2.8h,   v1.8h
        smull           v18.4s,  v3.4h,   v1.4h
        smull2          v19.4s,  v3.8h,   v1.8h
        smull           v2.4s,   v4.4h,   v1.4h
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v20.4s,  v16.4s,  #0     // sign
        cmlt            v21.4s,  v17.4s,  #0
        cmlt            v22.4s,  v18.4s,  #0
        cmlt            v23.4s,  v19.4s,  #0
        cmlt            v24.4s,  v2.4s,   #0
        cmlt            v25.4s,  v3.4s,   #0
        cmlt            v26.4s,  v4.4s,   #0
        cmlt            v27.4s,  v5.4s,   #0
        add             v16.4s,  v16.4s,  v20.4s // diff + sign
        add             v17.4s,  v17.4s,  v21.4s
        add             v18.4s,  v18.4s,  v22.4s
        add             v19.4s,  v19.4s,  v23.4s
        add             v2.4s,   v2.4s,   v24.4s
        add             v3.4s,   v3.4s,   v25.4s
        add             v4.4s,   v4.4s,   v26.4s
        add             v5.4s,   v5.4s,   v27.4s
        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        rshrn           v6.4h,   v2.4s,   #6
        rshrn2          v6.8h,   v3.4s,   #6
        rshrn           v7.4h,   v4.4s,   #6
        rshrn2          v7.8h,   v5.4s,   #6
        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
        add             v3.8h,   v17.8h,  v0.8h
        add             v4.8h,   v6.8h,   v0.8h
        add             v5.8h,   v7.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smax            v4.8h,   v4.8h,   v30.8h
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v2.8h, v3.8h},  [x0], #32
        st1             {v4.8h, v5.8h},  [x6], #32
        b.gt            1b
        subs            w4,  w4,  #2             // next pair of rows
        add             x5,  x5,  w9, uxtw #1    // skip the ac row x7 consumed
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                  // restore the width counter
        b.gt            1b
        ret

L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc
4555
// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
// CFL prediction with dc = rounded average of the top edge only, then
// tail-branches into the shared L(ipred_cfl_splat_*) code above.
function ipred_cfl_top_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26 // table index: 0 (w=32) .. 3 (w=4)
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #2  // skip the topleft pixel itself
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1  // second-row dst, as the splat code expects
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0   // lower clamp bound
        br              x7
4:      // w = 4: dc = (sum of 4 top pixels + 2) >> 2
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:      // w = 8: dc = (sum + 4) >> 3
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:     // w = 16: dc = (sum + 8) >> 4
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:     // w = 32: widen to 32 bit for the sum, dc = (sum + 16) >> 5
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v0.4h,   v0.4s,   #5
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc
4613
// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
// CFL prediction with dc = rounded average of the left edge only. The
// height selects the averaging routine (via x7); the width selects the
// shared L(ipred_cfl_splat_*) writeout routine (via x9).
function ipred_cfl_left_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1 // x2 = start of the left column
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26 // width index:  0 (32) .. 3 (4)
        sub             w8,  w8,  #26 // height index: 0 (32) .. 3 (4)
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6   // alpha
        sub             x9,  x10, w9, uxtw // x9 = splat routine for this width
        sub             x7,  x7,  w8, uxtw // x7 = averaging routine for this height
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0   // lower clamp bound
        br              x7

L(ipred_cfl_left_h4): // dc = (sum of 4 left pixels + 2) >> 2
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8): // dc = (sum + 4) >> 3
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16): // dc = (sum + 8) >> 4
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32): // widen to 32 bit for the sum, dc = (sum + 16) >> 5
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v0.4h,   v0.4s,   #5
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
4680
// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                           const pixel *const topleft,
//                           const int width, const int height,
//                           const int16_t *ac, const int alpha,
//                           const int bitdepth_max);
// Full CFL prediction: dc = (sum of left + top edges + (w+h)/2) / (w+h).
// The height handler (via x7) sums the left column, then jumps to the
// width handler (via x9), which adds the top row and divides. When w+h
// is not a power of two, the division is done as a shift by ctz(w+h)
// followed by a fixed-point multiply by ~2^17/3 (0xAAAB) or ~2^17/5
// (0x6667), since (w+h) >> ctz(w+h) is always 3 or 5 for mixed sizes.
function ipred_cfl_16bpc_neon, export=1
        dup             v31.8h,  w7              // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1    // x2 = start of the left column
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.4s, w8               // width + height
        adr             x7,  L(ipred_cfl_tbl)
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrh            w9,  [x7, w9, uxtw #1]
        ldrh            w6,  [x7, w6, uxtw #1]
        neg             w8,  w8                  // -ctz(width + height)
        sub             x9,  x7,  w9, uxtw       // x9 = width handler (second stage)
        sub             x7,  x7,  w6, uxtw       // x7 = height handler (first stage)
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
        dup             v17.4s,  w8              // -ctz(width + height)
        add             x6,  x0,  x1             // second-row dst for the splat code
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0              // lower clamp bound
        br              x7

L(ipred_cfl_h4): // sum the 4 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,      v0.4h
        add             x2,  x2,  #2             // skip the topleft pixel
        br              x9
L(ipred_cfl_w4): // add the 4 top pixels, divide by w + h
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding term
        uaddlv          s2,      v2.4h
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667             // ~2^17/5 (w+h = 20)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 12)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s // finish the division by 3 or 5
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]         // broadcast dc
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8): // sum the 8 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,      v0.8h
        add             x2,  x2,  #2             // skip the topleft pixel
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding term
        uaddlv          s2,      v2.8h
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667             // ~2^17/5 (w+h = 40)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 12 or 24)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16): // sum the 16 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2], #32
        addp            v0.8h,   v2.8h,   v3.8h
        add             x2,  x2,  #2             // skip the topleft pixel
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding term
        addp            v2.8h,   v2.8h,   v3.8h
        uaddlv          s2,      v2.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 4/8/32
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667             // ~2^17/5 (h = 4, w+h = 20)
        mov             w17, #0xAAAB             // ~2^17/3 (h = 8/32)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32): // sum the 32 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        add             x2,  x2,  #2             // skip the topleft pixel
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        add             v0.4s,   v0.4s,   v16.4s // add the rounding term
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v2.8h,   v2.8h,   v4.8h
        cmp             w4,  #32
        uaddlv          s2,      v2.8h
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #8
        mov             w16, #0x6667             // ~2^17/5 (w+h = 40)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 48)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_tbl): // first 4 entries: height handlers; last 4: width handlers
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc
4837
4838// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
4839//                            const ptrdiff_t stride, const int w_pad,
4840//                            const int h_pad, const int cw, const int ch);
4841function ipred_cfl_ac_420_16bpc_neon, export=1
4842        clz             w8,  w5
4843        lsl             w4,  w4,  #2
4844        adr             x7,  L(ipred_cfl_ac_420_tbl)
4845        sub             w8,  w8,  #27
4846        ldrh            w8,  [x7, w8, uxtw #1]
4847        movi            v24.4s,  #0
4848        movi            v25.4s,  #0
4849        movi            v26.4s,  #0
4850        movi            v27.4s,  #0
4851        sub             x7,  x7,  w8, uxtw
4852        sub             w8,  w6,  w4         // height - h_pad
4853        rbit            w9,  w5              // rbit(width)
4854        rbit            w10, w6              // rbit(height)
4855        clz             w9,  w9              // ctz(width)
4856        clz             w10, w10             // ctz(height)
4857        add             w9,  w9,  w10        // log2sz
4858        add             x10, x1,  x2
4859        dup             v31.4s,  w9
4860        lsl             x2,  x2,  #1
4861        neg             v31.4s,  v31.4s      // -log2sz
4862        br              x7
4863
4864L(ipred_cfl_ac_420_w4):
4865        AARCH64_VALID_JUMP_TARGET
48661:      // Copy and subsample input
4867        ld1             {v0.8h}, [x1],  x2
4868        ld1             {v1.8h}, [x10], x2
4869        ld1             {v2.8h}, [x1],  x2
4870        ld1             {v3.8h}, [x10], x2
4871        addp            v0.8h,   v0.8h,   v2.8h
4872        addp            v1.8h,   v1.8h,   v3.8h
4873        add             v0.8h,   v0.8h,   v1.8h
4874        shl             v0.8h,   v0.8h,   #1
4875        subs            w8,  w8,  #2
4876        st1             {v0.8h}, [x0], #16
4877        uaddw           v24.4s,  v24.4s,  v0.4h
4878        uaddw2          v25.4s,  v25.4s,  v0.8h
4879        b.gt            1b
4880        trn2            v1.2d,   v0.2d,   v0.2d
4881        trn2            v0.2d,   v0.2d,   v0.2d
4882L(ipred_cfl_ac_420_w4_hpad):
4883        cbz             w4,  3f
48842:      // Vertical padding (h_pad > 0)
4885        subs            w4,  w4,  #4
4886        st1             {v0.8h, v1.8h}, [x0], #32
4887        uaddw           v24.4s,  v24.4s,  v0.4h
4888        uaddw2          v25.4s,  v25.4s,  v0.8h
4889        uaddw           v26.4s,  v26.4s,  v1.4h
4890        uaddw2          v27.4s,  v27.4s,  v1.8h
4891        b.gt            2b
48923:
4893L(ipred_cfl_ac_420_w4_calc_subtract_dc):
4894        // Aggregate the sums
4895        add             v24.4s,  v24.4s,  v25.4s
4896        add             v26.4s,  v26.4s,  v27.4s
4897        add             v0.4s,   v24.4s,  v26.4s
4898        addv            s0,  v0.4s                // sum
4899        sub             x0,  x0,  w6, uxtw #3
4900        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
4901        dup             v4.8h,   v4.h[0]
49026:      // Subtract dc from ac
4903        ld1             {v0.8h, v1.8h}, [x0]
4904        subs            w6,  w6,  #4
4905        sub             v0.8h,   v0.8h,   v4.8h
4906        sub             v1.8h,   v1.8h,   v4.8h
4907        st1             {v0.8h, v1.8h}, [x0], #32
4908        b.gt            6b
4909        ret
4910
4911L(ipred_cfl_ac_420_w8):
4912        AARCH64_VALID_JUMP_TARGET
4913        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
49141:      // Copy and subsample input, without padding
4915        ld1             {v0.8h, v1.8h}, [x1],  x2
4916        ld1             {v2.8h, v3.8h}, [x10], x2
4917        ld1             {v4.8h, v5.8h}, [x1],  x2
4918        addp            v0.8h,   v0.8h,   v1.8h
4919        ld1             {v6.8h, v7.8h}, [x10], x2
4920        addp            v2.8h,   v2.8h,   v3.8h
4921        addp            v4.8h,   v4.8h,   v5.8h
4922        addp            v6.8h,   v6.8h,   v7.8h
4923        add             v0.8h,   v0.8h,   v2.8h
4924        add             v4.8h,   v4.8h,   v6.8h
4925        shl             v0.8h,   v0.8h,   #1
4926        shl             v1.8h,   v4.8h,   #1
4927        subs            w8,  w8,  #2
4928        st1             {v0.8h, v1.8h}, [x0], #32
4929        uaddw           v24.4s,  v24.4s,  v0.4h
4930        uaddw2          v25.4s,  v25.4s,  v0.8h
4931        uaddw           v26.4s,  v26.4s,  v1.4h
4932        uaddw2          v27.4s,  v27.4s,  v1.8h
4933        b.gt            1b
4934        mov             v0.16b,  v1.16b
4935        b               L(ipred_cfl_ac_420_w8_hpad)
4936
4937L(ipred_cfl_ac_420_w8_wpad):
49381:      // Copy and subsample input, padding 4
4939        ld1             {v0.8h}, [x1],  x2
4940        ld1             {v1.8h}, [x10], x2
4941        ld1             {v2.8h}, [x1],  x2
4942        ld1             {v3.8h}, [x10], x2
4943        addp            v0.8h,   v0.8h,   v2.8h
4944        addp            v1.8h,   v1.8h,   v3.8h
4945        add             v0.8h,   v0.8h,   v1.8h
4946        shl             v0.8h,   v0.8h,   #1
4947        dup             v1.4h,   v0.h[3]
4948        dup             v3.4h,   v0.h[7]
4949        trn2            v2.2d,   v0.2d,   v0.2d
4950        subs            w8,  w8,  #2
4951        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
4952        uaddw           v24.4s,  v24.4s,  v0.4h
4953        uaddw           v25.4s,  v25.4s,  v1.4h
4954        uaddw           v26.4s,  v26.4s,  v2.4h
4955        uaddw           v27.4s,  v27.4s,  v3.4h
4956        b.gt            1b
4957        trn1            v0.2d,   v2.2d,   v3.2d
4958        trn1            v1.2d,   v2.2d,   v3.2d
4959
4960L(ipred_cfl_ac_420_w8_hpad):
4961        cbz             w4,  3f
49622:      // Vertical padding (h_pad > 0)
4963        subs            w4,  w4,  #4
4964        st1             {v0.8h, v1.8h}, [x0], #32
4965        uaddw           v24.4s,  v24.4s,  v0.4h
4966        uaddw2          v25.4s,  v25.4s,  v0.8h
4967        uaddw           v26.4s,  v26.4s,  v1.4h
4968        uaddw2          v27.4s,  v27.4s,  v1.8h
4969        st1             {v0.8h, v1.8h}, [x0], #32
4970        uaddw           v24.4s,  v24.4s,  v0.4h
4971        uaddw2          v25.4s,  v25.4s,  v0.8h
4972        uaddw           v26.4s,  v26.4s,  v1.4h
4973        uaddw2          v27.4s,  v27.4s,  v1.8h
4974        b.gt            2b
49753:
4976
4977        // Double the height and reuse the w4 summing/subtracting
4978        lsl             w6,  w6,  #1
4979        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
4980
4981L(ipred_cfl_ac_420_w16):
4982        AARCH64_VALID_JUMP_TARGET
4983        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
4984        ldrh            w3,  [x7, w3, uxtw #1]
4985        sub             x7,  x7,  w3, uxtw
4986        br              x7
4987
4988L(ipred_cfl_ac_420_w16_wpad0):
4989        AARCH64_VALID_JUMP_TARGET
49901:      // Copy and subsample input, without padding
4991        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
4992        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
4993        addp            v0.8h,   v0.8h,   v1.8h
4994        addp            v2.8h,   v2.8h,   v3.8h
4995        addp            v4.8h,   v4.8h,   v5.8h
4996        addp            v6.8h,   v6.8h,   v7.8h
4997        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
4998        add             v0.8h,   v0.8h,   v4.8h
4999        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
5000        add             v2.8h,   v2.8h,   v6.8h
5001        addp            v16.8h,  v16.8h,  v17.8h
5002        addp            v18.8h,  v18.8h,  v19.8h
5003        addp            v20.8h,  v20.8h,  v21.8h
5004        addp            v22.8h,  v22.8h,  v23.8h
5005        add             v16.8h,  v16.8h,  v20.8h
5006        add             v18.8h,  v18.8h,  v22.8h
5007        shl             v0.8h,   v0.8h,   #1
5008        shl             v1.8h,   v2.8h,   #1
5009        shl             v2.8h,   v16.8h,  #1
5010        shl             v3.8h,   v18.8h,  #1
5011        subs            w8,  w8,  #2
5012        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5013        uaddw           v24.4s,  v24.4s,  v0.4h
5014        uaddw2          v25.4s,  v25.4s,  v0.8h
5015        uaddw           v26.4s,  v26.4s,  v1.4h
5016        uaddw2          v27.4s,  v27.4s,  v1.8h
5017        uaddw           v24.4s,  v24.4s,  v2.4h
5018        uaddw2          v25.4s,  v25.4s,  v2.8h
5019        uaddw           v26.4s,  v26.4s,  v3.4h
5020        uaddw2          v27.4s,  v27.4s,  v3.8h
5021        b.gt            1b
5022        mov             v0.16b,  v2.16b
5023        mov             v1.16b,  v3.16b
5024        b               L(ipred_cfl_ac_420_w16_hpad)
5025
5026L(ipred_cfl_ac_420_w16_wpad1):
5027        AARCH64_VALID_JUMP_TARGET
50281:      // Copy and subsample input, padding 4
5029        ldr             q2,  [x1,  #32]
5030        ld1             {v0.8h, v1.8h}, [x1],  x2
5031        ldr             q5,  [x10, #32]
5032        ld1             {v3.8h, v4.8h}, [x10], x2
5033        addp            v2.8h,   v2.8h,   v2.8h
5034        addp            v0.8h,   v0.8h,   v1.8h
5035        addp            v5.8h,   v5.8h,   v5.8h
5036        addp            v3.8h,   v3.8h,   v4.8h
5037        ldr             q18, [x1,  #32]
5038        add             v2.4h,   v2.4h,   v5.4h
5039        ld1             {v16.8h, v17.8h}, [x1],  x2
5040        add             v0.8h,   v0.8h,   v3.8h
5041        ldr             q21, [x10, #32]
5042        ld1             {v19.8h, v20.8h}, [x10], x2
5043        addp            v18.8h,  v18.8h,  v18.8h
5044        addp            v16.8h,  v16.8h,  v17.8h
5045        addp            v21.8h,  v21.8h,  v21.8h
5046        addp            v19.8h,  v19.8h,  v20.8h
5047        add             v18.4h,  v18.4h,  v21.4h
5048        add             v16.8h,  v16.8h,  v19.8h
5049        shl             v1.4h,   v2.4h,   #1
5050        shl             v0.8h,   v0.8h,   #1
5051        shl             v3.4h,   v18.4h,  #1
5052        shl             v2.8h,   v16.8h,  #1
5053        dup             v4.4h,   v1.h[3]
5054        dup             v5.4h,   v3.h[3]
5055        trn1            v1.2d,   v1.2d,   v4.2d
5056        trn1            v3.2d,   v3.2d,   v5.2d
5057        subs            w8,  w8,  #2
5058        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5059        uaddw           v24.4s,  v24.4s,  v0.4h
5060        uaddw2          v25.4s,  v25.4s,  v0.8h
5061        uaddw           v26.4s,  v26.4s,  v1.4h
5062        uaddw2          v27.4s,  v27.4s,  v1.8h
5063        uaddw           v24.4s,  v24.4s,  v2.4h
5064        uaddw2          v25.4s,  v25.4s,  v2.8h
5065        uaddw           v26.4s,  v26.4s,  v3.4h
5066        uaddw2          v27.4s,  v27.4s,  v3.8h
5067        b.gt            1b
5068        mov             v0.16b,  v2.16b
5069        mov             v1.16b,  v3.16b
5070        b               L(ipred_cfl_ac_420_w16_hpad)
5071
5072L(ipred_cfl_ac_420_w16_wpad2):
5073        AARCH64_VALID_JUMP_TARGET
50741:      // Copy and subsample input, padding 8
5075        ld1             {v0.8h, v1.8h}, [x1],  x2
5076        ld1             {v2.8h, v3.8h}, [x10], x2
5077        ld1             {v4.8h, v5.8h}, [x1],  x2
5078        addp            v0.8h,   v0.8h,   v1.8h
5079        ld1             {v6.8h, v7.8h}, [x10], x2
5080        addp            v2.8h,   v2.8h,   v3.8h
5081        addp            v4.8h,   v4.8h,   v5.8h
5082        addp            v6.8h,   v6.8h,   v7.8h
5083        add             v0.8h,   v0.8h,   v2.8h
5084        add             v4.8h,   v4.8h,   v6.8h
5085        shl             v0.8h,   v0.8h,   #1
5086        shl             v2.8h,   v4.8h,   #1
5087        dup             v1.8h,   v0.h[7]
5088        dup             v3.8h,   v2.h[7]
5089        subs            w8,  w8,  #2
5090        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5091        uaddw           v24.4s,  v24.4s,  v0.4h
5092        uaddw2          v25.4s,  v25.4s,  v0.8h
5093        uaddw           v26.4s,  v26.4s,  v1.4h
5094        uaddw2          v27.4s,  v27.4s,  v1.8h
5095        uaddw           v24.4s,  v24.4s,  v2.4h
5096        uaddw2          v25.4s,  v25.4s,  v2.8h
5097        uaddw           v26.4s,  v26.4s,  v3.4h
5098        uaddw2          v27.4s,  v27.4s,  v3.8h
5099        b.gt            1b
5100        mov             v0.16b,  v2.16b
5101        mov             v1.16b,  v3.16b
5102        b               L(ipred_cfl_ac_420_w16_hpad)
5103
5104L(ipred_cfl_ac_420_w16_wpad3):
5105        AARCH64_VALID_JUMP_TARGET
51061:      // Copy and subsample input, padding 12
5107        ld1             {v0.8h}, [x1],  x2
5108        ld1             {v2.8h}, [x10], x2
5109        ld1             {v4.8h}, [x1],  x2
5110        ld1             {v6.8h}, [x10], x2
5111        addp            v0.8h,   v0.8h,   v4.8h
5112        addp            v2.8h,   v2.8h,   v6.8h
5113        add             v0.8h,   v0.8h,   v2.8h
5114        shl             v0.8h,   v0.8h,   #1
5115        dup             v1.8h,   v0.h[3]
5116        dup             v3.8h,   v0.h[7]
5117        trn2            v2.2d,   v0.2d,   v3.2d
5118        trn1            v0.2d,   v0.2d,   v1.2d
5119        subs            w8,  w8,  #2
5120        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5121        uaddw           v24.4s,  v24.4s,  v0.4h
5122        uaddw2          v25.4s,  v25.4s,  v0.8h
5123        uaddw           v26.4s,  v26.4s,  v1.4h
5124        uaddw2          v27.4s,  v27.4s,  v1.8h
5125        uaddw           v24.4s,  v24.4s,  v2.4h
5126        uaddw2          v25.4s,  v25.4s,  v2.8h
5127        uaddw           v26.4s,  v26.4s,  v3.4h
5128        uaddw2          v27.4s,  v27.4s,  v3.8h
5129        b.gt            1b
5130        mov             v0.16b,  v2.16b
5131        mov             v1.16b,  v3.16b
5132
5133L(ipred_cfl_ac_420_w16_hpad):
5134        cbz             w4,  3f
51352:      // Vertical padding (h_pad > 0)
5136        subs            w4,  w4,  #4
5137        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5138        uaddw           v24.4s,  v24.4s,  v0.4h
5139        uaddw2          v25.4s,  v25.4s,  v0.8h
5140        uaddw           v26.4s,  v26.4s,  v1.4h
5141        uaddw2          v27.4s,  v27.4s,  v1.8h
5142        uaddw           v24.4s,  v24.4s,  v2.4h
5143        uaddw2          v25.4s,  v25.4s,  v2.8h
5144        uaddw           v26.4s,  v26.4s,  v3.4h
5145        uaddw2          v27.4s,  v27.4s,  v3.8h
5146        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5147        uaddw           v24.4s,  v24.4s,  v0.4h
5148        uaddw2          v25.4s,  v25.4s,  v0.8h
5149        uaddw           v26.4s,  v26.4s,  v1.4h
5150        uaddw2          v27.4s,  v27.4s,  v1.8h
5151        uaddw           v24.4s,  v24.4s,  v2.4h
5152        uaddw2          v25.4s,  v25.4s,  v2.8h
5153        uaddw           v26.4s,  v26.4s,  v3.4h
5154        uaddw2          v27.4s,  v27.4s,  v3.8h
5155        b.gt            2b
51563:
5157
5158        // Quadruple the height and reuse the w4 summing/subtracting
5159        lsl             w6,  w6,  #2
5160        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5161
5162L(ipred_cfl_ac_420_tbl):
5163        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
5164        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
5165        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
5166        .hword 0
5167
5168L(ipred_cfl_ac_420_w16_tbl):
5169        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
5170        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
5171        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
5172        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
5173endfunc
5174
// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_422_16bpc_neon, export=1
        // Entry registers (per the C prototype above):
        //   x0 = ac output (int16_t *), x1 = ypx (luma), x2 = stride (bytes)
        //   w3 = w_pad, w4 = h_pad (units of 4), w5 = cw (width), w6 = ch (height)
        // Running sums of all stored AC values are accumulated in v24-v27
        // (.4s lanes) so the shared 4:2:0 tail can average and subtract the DC.
        clz             w8,  w5              // clz(width) = 29/28/27 for w = 4/8/16
        lsl             w4,  w4,  #2         // h_pad in units of 4 -> rows
        adr             x7,  L(ipred_cfl_ac_422_tbl)
        sub             w8,  w8,  #27        // table index 2/1/0 for w = 4/8/16
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v24.4s,  #0
        movi            v25.4s,  #0
        movi            v26.4s,  #0
        movi            v27.4s,  #0
        sub             x7,  x7,  w8, uxtw   // resolve branch target from offset table
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // x10 = pointer to the second input row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // both pointers step two rows at a time
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        // Four rows of 8 luma samples per iteration; addp sums horizontal
        // pairs (4:2:2 -> halve width only), and << 2 scales the 2-sample
        // sum so every stored AC value is 8x the subsampled average.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v2.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        b.gt            1b
        // Broadcast the last 4-sample output row (high half of v1) into both
        // halves of v0 and v1, then reuse the 4:2:0 vertical-padding tail.
        trn2            v0.2d,   v1.2d,   v1.2d
        trn2            v1.2d,   v1.2d,   v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        // Four rows of 16 luma samples per iteration -> four 8-wide AC rows.
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        ld1             {v4.8h, v5.8h}, [x1],  x2
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v6.8h, v7.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v2.8h,   #2
        shl             v2.8h,   v4.8h,   #2
        shl             v3.8h,   v6.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v3; copy it into v0/v1 for vertical padding.
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        // Only 8 luma samples per row are present; compute the left 4 AC
        // values and replicate the rightmost one across the padded half.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v0.h[3]
        dup             v5.8h,   v0.h[7]
        dup             v6.4h,   v2.h[3]
        dup             v7.8h,   v2.h[7]
        trn2            v1.2d,   v0.2d,   v5.2d
        trn1            v0.2d,   v0.2d,   v4.2d
        trn2            v3.2d,   v2.2d,   v7.2d
        trn1            v2.2d,   v2.2d,   v6.2d
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v3; copy it into v0/v1 for vertical padding.
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        // Second-level dispatch on w_pad (0-3), same offset-table scheme
        // as the function entry.
        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        // Two rows of 32 luma samples per iteration -> two 16-wide AC rows.
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v2.8h,   #2
        shl             v2.8h,   v4.8h,   #2
        shl             v3.8h,   v6.8h,   #2
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        // 24 luma samples per row: the third vector (q2/q6) only yields
        // 4 AC values; the final 4 are replicated from its last sample.
        ldr             q2,  [x1,  #32]
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ldr             q6,  [x10, #32]
        ld1             {v4.8h, v5.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v2.8h
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v6.8h,   v6.8h,   v6.8h
        addp            v4.8h,   v4.8h,   v5.8h
        shl             v1.4h,   v2.4h,   #2
        shl             v0.8h,   v0.8h,   #2
        shl             v3.4h,   v6.4h,   #2
        shl             v2.8h,   v4.8h,   #2
        dup             v4.4h,   v1.h[3]
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        // 16 luma samples per row -> 8 AC values; the right half of each
        // output row replicates the last computed sample.
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v1.8h,   v0.h[7]
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        // 8 luma samples per row -> 4 AC values; the remaining 12 per row
        // replicate the last computed sample.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v2.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v0.8h
        addp            v2.8h,   v2.8h,   v2.8h
        shl             v0.4h,   v0.4h,   #2
        shl             v2.4h,   v2.4h,   #2
        dup             v1.8h,   v0.h[3]
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

        // Offset tables: each entry is the distance from the table label back
        // to the handler, loaded with ldrh and subtracted from the table
        // address before the indirect branch.
L(ipred_cfl_ac_422_tbl):
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
        .hword 0

L(ipred_cfl_ac_422_w16_tbl):
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc
5422
// void ipred_cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
5426function ipred_cfl_ac_444_16bpc_neon, export=1
5427        clz             w8,  w5
5428        lsl             w4,  w4,  #2
5429        adr             x7,  L(ipred_cfl_ac_444_tbl)
5430        sub             w8,  w8,  #26
5431        ldrh            w8,  [x7, w8, uxtw #1]
5432        movi            v24.4s,  #0
5433        movi            v25.4s,  #0
5434        movi            v26.4s,  #0
5435        movi            v27.4s,  #0
5436        sub             x7,  x7,  w8, uxtw
5437        sub             w8,  w6,  w4         // height - h_pad
5438        rbit            w9,  w5              // rbit(width)
5439        rbit            w10, w6              // rbit(height)
5440        clz             w9,  w9              // ctz(width)
5441        clz             w10, w10             // ctz(height)
5442        add             w9,  w9,  w10        // log2sz
5443        add             x10, x1,  x2
5444        dup             v31.4s,  w9
5445        lsl             x2,  x2,  #1
5446        neg             v31.4s,  v31.4s      // -log2sz
5447        br              x7
5448
5449L(ipred_cfl_ac_444_w4):
5450        AARCH64_VALID_JUMP_TARGET
54511:      // Copy and expand input
5452        ld1             {v0.4h},   [x1],  x2
5453        ld1             {v0.d}[1], [x10], x2
5454        ld1             {v1.4h},   [x1],  x2
5455        ld1             {v1.d}[1], [x10], x2
5456        shl             v0.8h,   v0.8h,   #3
5457        shl             v1.8h,   v1.8h,   #3
5458        subs            w8,  w8,  #4
5459        st1             {v0.8h, v1.8h}, [x0], #32
5460        uaddw           v24.4s,  v24.4s,  v0.4h
5461        uaddw2          v25.4s,  v25.4s,  v0.8h
5462        uaddw           v26.4s,  v26.4s,  v1.4h
5463        uaddw2          v27.4s,  v27.4s,  v1.8h
5464        b.gt            1b
5465        trn2            v0.2d,   v1.2d,   v1.2d
5466        trn2            v1.2d,   v1.2d,   v1.2d
5467        b               L(ipred_cfl_ac_420_w4_hpad)
5468
5469L(ipred_cfl_ac_444_w8):
5470        AARCH64_VALID_JUMP_TARGET
54711:      // Copy and expand input
5472        ld1             {v0.8h}, [x1],  x2
5473        ld1             {v1.8h}, [x10], x2
5474        ld1             {v2.8h}, [x1],  x2
5475        shl             v0.8h,   v0.8h,   #3
5476        ld1             {v3.8h}, [x10], x2
5477        shl             v1.8h,   v1.8h,   #3
5478        shl             v2.8h,   v2.8h,   #3
5479        shl             v3.8h,   v3.8h,   #3
5480        subs            w8,  w8,  #4
5481        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5482        uaddw           v24.4s,  v24.4s,  v0.4h
5483        uaddw2          v25.4s,  v25.4s,  v0.8h
5484        uaddw           v26.4s,  v26.4s,  v1.4h
5485        uaddw2          v27.4s,  v27.4s,  v1.8h
5486        uaddw           v24.4s,  v24.4s,  v2.4h
5487        uaddw2          v25.4s,  v25.4s,  v2.8h
5488        uaddw           v26.4s,  v26.4s,  v3.4h
5489        uaddw2          v27.4s,  v27.4s,  v3.8h
5490        b.gt            1b
5491        mov             v0.16b,  v3.16b
5492        mov             v1.16b,  v3.16b
5493        b               L(ipred_cfl_ac_420_w8_hpad)
5494
5495L(ipred_cfl_ac_444_w16):
5496        AARCH64_VALID_JUMP_TARGET
5497        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
54981:      // Copy and expand input, without padding
5499        ld1             {v0.8h, v1.8h}, [x1],  x2
5500        ld1             {v2.8h, v3.8h}, [x10], x2
5501        shl             v0.8h,   v0.8h,   #3
5502        shl             v1.8h,   v1.8h,   #3
5503        shl             v2.8h,   v2.8h,   #3
5504        shl             v3.8h,   v3.8h,   #3
5505        subs            w8,  w8,  #2
5506        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5507        uaddw           v24.4s,  v24.4s,  v0.4h
5508        uaddw2          v25.4s,  v25.4s,  v0.8h
5509        uaddw           v26.4s,  v26.4s,  v1.4h
5510        uaddw2          v27.4s,  v27.4s,  v1.8h
5511        uaddw           v24.4s,  v24.4s,  v2.4h
5512        uaddw2          v25.4s,  v25.4s,  v2.8h
5513        uaddw           v26.4s,  v26.4s,  v3.4h
5514        uaddw2          v27.4s,  v27.4s,  v3.8h
5515        b.gt            1b
5516        mov             v0.16b,  v2.16b
5517        mov             v1.16b,  v3.16b
5518        b               L(ipred_cfl_ac_420_w16_hpad)
5519
5520L(ipred_cfl_ac_444_w16_wpad):
55211:      // Copy and expand input, padding 8
5522        ld1             {v0.8h}, [x1],  x2
5523        ld1             {v2.8h}, [x10], x2
5524        shl             v0.8h,   v0.8h,   #3
5525        shl             v2.8h,   v2.8h,   #3
5526        dup             v1.8h,   v0.h[7]
5527        dup             v3.8h,   v2.h[7]
5528        subs            w8,  w8,  #2
5529        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5530        uaddw           v24.4s,  v24.4s,  v0.4h
5531        uaddw2          v25.4s,  v25.4s,  v0.8h
5532        uaddw           v26.4s,  v26.4s,  v1.4h
5533        uaddw2          v27.4s,  v27.4s,  v1.8h
5534        uaddw           v24.4s,  v24.4s,  v2.4h
5535        uaddw2          v25.4s,  v25.4s,  v2.8h
5536        uaddw           v26.4s,  v26.4s,  v3.4h
5537        uaddw2          v27.4s,  v27.4s,  v3.8h
5538        b.gt            1b
5539        mov             v0.16b,  v2.16b
5540        mov             v1.16b,  v3.16b
5541        b               L(ipred_cfl_ac_420_w16_hpad)
5542
5543L(ipred_cfl_ac_444_w32):
5544        AARCH64_VALID_JUMP_TARGET
5545        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
5546        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
5547        lsr             x2,  x2,  #1 // Restore the stride to one line increments
5548        sub             x7,  x7,  w3, uxtw
5549        br              x7
5550
5551L(ipred_cfl_ac_444_w32_wpad0):
5552        AARCH64_VALID_JUMP_TARGET
55531:      // Copy and expand input, without padding
5554        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
5555        shl             v0.8h,   v0.8h,   #3
5556        shl             v1.8h,   v1.8h,   #3
5557        shl             v2.8h,   v2.8h,   #3
5558        shl             v3.8h,   v3.8h,   #3
5559        subs            w8,  w8,  #1
5560        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5561        uaddw           v24.4s,  v24.4s,  v0.4h
5562        uaddw2          v25.4s,  v25.4s,  v0.8h
5563        uaddw           v26.4s,  v26.4s,  v1.4h
5564        uaddw2          v27.4s,  v27.4s,  v1.8h
5565        uaddw           v24.4s,  v24.4s,  v2.4h
5566        uaddw2          v25.4s,  v25.4s,  v2.8h
5567        uaddw           v26.4s,  v26.4s,  v3.4h
5568        uaddw2          v27.4s,  v27.4s,  v3.8h
5569        b.gt            1b
5570        b               L(ipred_cfl_ac_444_w32_hpad)
5571
// Width 32 with 8 pixels of horizontal padding: 24 valid pixels per
// row; the last vector is filled by replicating the rightmost valid
// (already scaled) pixel, v2.h[7].
L(ipred_cfl_ac_444_w32_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 8
        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
        // v2 is scaled first so its last lane is ready for the dup below.
        shl             v2.8h,   v2.8h,   #3
        shl             v0.8h,   v0.8h,   #3
        shl             v1.8h,   v1.8h,   #3
        dup             v3.8h,   v2.h[7]        // Pad with last valid pixel
        subs            w8,  w8,  #1            // One input row consumed
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        // Accumulate the full (padded) row into the running sums.
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)
5592
// Width 32 with 16 pixels of horizontal padding: 16 valid pixels per
// row; the two trailing vectors are both filled from the rightmost
// valid (already scaled) pixel, v1.h[7].
L(ipred_cfl_ac_444_w32_wpad4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 16
        ld1             {v0.8h, v1.8h}, [x1],  x2
        // v1 is scaled first so its last lane is ready for the dups below.
        shl             v1.8h,   v1.8h,   #3
        shl             v0.8h,   v0.8h,   #3
        dup             v2.8h,   v1.h[7]        // Pad with last valid pixel
        dup             v3.8h,   v1.h[7]
        subs            w8,  w8,  #1            // One input row consumed
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        // Accumulate the full (padded) row into the running sums.
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)
5613
// Width 32 with 24 pixels of horizontal padding: 8 valid pixels per
// row; the three trailing vectors are all filled from the rightmost
// valid (already scaled) pixel, v0.h[7].
// Falls through into L(ipred_cfl_ac_444_w32_hpad) below.
L(ipred_cfl_ac_444_w32_wpad6):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 24
        ld1             {v0.8h}, [x1],  x2
        shl             v0.8h,   v0.8h,   #3
        dup             v1.8h,   v0.h[7]        // Pad with last valid pixel
        dup             v2.8h,   v0.h[7]
        dup             v3.8h,   v0.h[7]
        subs            w8,  w8,  #1            // One input row consumed
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        // Accumulate the full (padded) row into the running sums.
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
5633
// Shared vertical-padding tail for all w32 wpad variants above.
// v0-v3 still hold the last expanded row; w4 = number of padding rows
// to emit (presumably even — the loop writes two rows per iteration;
// TODO(review): confirm against the caller's setup).
L(ipred_cfl_ac_444_w32_hpad):
        cbz             w4,  3f                 // No vertical padding needed
2:      // Vertical padding (h_pad > 0)
        // Replicate the last row twice per iteration, accumulating
        // each stored copy into the sums so the DC average stays
        // consistent with the buffer contents.
        subs            w4,  w4,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            2b
3:

        //  Multiply the height by eight and reuse the w4 subtracting
        // (a w32 row holds 8x the elements of a w4 row, so scaling w6
        // by 8 lets the shared w4 routine divide by the right count).
        lsl             w6,  w6,  #3
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5662
// Width dispatch table for the 4:4:4 AC function: 16-bit offsets that
// the dispatcher subtracts from the table address to form the branch
// target (cf. the ldrh/sub/br sequence). Entries are ordered w32,
// w16, w8, w4.
L(ipred_cfl_ac_444_tbl):
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
5668
// Horizontal-padding dispatch table for the w32 path: one entry per
// wpad value (0/2/4/6, i.e. 0/8/16/24 padded pixels), resolved the
// same way as L(ipred_cfl_ac_444_tbl) above.
L(ipred_cfl_ac_444_w32_tbl):
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
5674endfunc
5675