/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2019, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

// void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height,
//                              const int bitdepth_max);
// DC_128 prediction: fill the block with the mid-range value
// (bitdepth_max + 1) >> 1, used when no edge pixels are available.
function ipred_dc_128_16bpc_neon, export=1
        ldr             w8,  [sp]               // bitdepth_max (9th argument, on the stack)
        clz             w3,  w3
        adr             x5,  L(ipred_dc_128_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: 0..4 for width 64..4
        ldrh            w3,  [x5, w3, uxtw #1]
        dup             v0.8h,   w8
        sub             x5,  x5,  w3, uxtw      // per-width branch target
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        urshr           v0.8h,   v0.8h,  #1     // v0 = (bitdepth_max + 1) >> 1
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4            // four rows per iteration
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_128_tbl):
        .hword L(ipred_dc_128_tbl) - 640b
        .hword L(ipred_dc_128_tbl) - 320b
        .hword L(ipred_dc_128_tbl) - 160b
        .hword L(ipred_dc_128_tbl) -   8b
        .hword L(ipred_dc_128_tbl) -   4b
endfunc

// void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Vertical prediction: replicate the top edge row into every output row.
function ipred_v_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_v_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: jump table index
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #2            // skip the top-left corner pixel
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]          // load the top row once, store it repeatedly
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_v_tbl):
        .hword L(ipred_v_tbl) - 640b
        .hword L(ipred_v_tbl) - 320b
        .hword L(ipred_v_tbl) - 160b
        .hword L(ipred_v_tbl) -  80b
        .hword L(ipred_v_tbl) -  40b
endfunc

// void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height);
// Horizontal prediction: broadcast each left-edge pixel across its row.
// The left edge is read 4 pixels at a time, walking upwards (x7 = -8 bytes);
// ld4r replicates each of the 4 consecutive pixels into its own vector,
// so v3 holds the first (topmost) of the 4 rows and v0 the last.
function ipred_h_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_h_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: jump table index
        ldrh            w3,  [x5, w3, uxtw #1]
        sub             x2,  x2,  #8            // step back to the 4 left pixels above dst row 3
        sub             x5,  x5,  w3, uxtw
        mov             x7,  #-8                // walk the left edge upwards, 4 pixels at a time
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5
4:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.4h},  [x0], x1
        st1             {v2.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
8:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        st1             {v3.8h},  [x0], x1
        st1             {v2.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v1.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
16:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]          // second 8 pixels via plain str,
        str             q2,  [x6, #16]          // first 8 via st1 with row advance
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            16b
        ret
32:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            32b
        ret
64:
        AARCH64_VALID_JUMP_TARGET
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
        str             q3,  [x0, #16]
        str             q2,  [x6, #16]
        stp             q3,  q3,  [x0, #32]
        stp             q2,  q2,  [x6, #32]
        stp             q3,  q3,  [x0, #64]
        stp             q2,  q2,  [x6, #64]
        stp             q3,  q3,  [x0, #96]
        stp             q2,  q2,  [x6, #96]
        st1             {v3.8h}, [x0], x1
        st1             {v2.8h}, [x6], x1
        subs            w4,  w4,  #4
        str             q1,  [x0, #16]
        str             q0,  [x6, #16]
        stp             q1,  q1,  [x0, #32]
        stp             q0,  q0,  [x6, #32]
        stp             q1,  q1,  [x0, #64]
        stp             q0,  q0,  [x6, #64]
        stp             q1,  q1,  [x0, #96]
        stp             q0,  q0,  [x6, #96]
        st1             {v1.8h}, [x0], x1
        st1             {v0.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_h_tbl):
        .hword L(ipred_h_tbl) - 64b
        .hword L(ipred_h_tbl) - 32b
        .hword L(ipred_h_tbl) - 16b
        .hword L(ipred_h_tbl) -  8b
        .hword L(ipred_h_tbl) -  4b
endfunc

// void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                              const pixel *const topleft,
//                              const int width, const int height, const int a,
//                              const int max_width, const int max_height);
// DC_TOP prediction: fill the block with the rounded average of the top
// edge row; the divide is a rounding shift by log2(width).
function ipred_dc_top_16bpc_neon, export=1
        clz             w3,  w3
        adr             x5,  L(ipred_dc_top_tbl)
        sub             w3,  w3,  #25           // clz(width)-25: jump table index
        ldrh            w3,  [x5, w3, uxtw #1]
        add             x2,  x2,  #2            // skip the top-left corner pixel
        sub             x5,  x5,  w3, uxtw
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h          // sum of 4 pixels (fits in 16 bits)
        urshr           v0.4h,   v0.4h,   #2    // rounded average
        dup             v0.4h,   v0.h[0]
4:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
8:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h // pairwise reduce 16 -> 8 partial sums
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
16:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlv          s0,      v0.8h          // widen: 32 pixels can overflow 16 bits
        rshrn           v4.4h,   v0.4s,   #5
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
32:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v4.4h,   v0.4s,   #6
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
64:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            64b
        ret

L(ipred_dc_top_tbl):
        .hword L(ipred_dc_top_tbl) - 640b
        .hword L(ipred_dc_top_tbl) - 320b
        .hword L(ipred_dc_top_tbl) - 160b
        .hword L(ipred_dc_top_tbl) -  80b
        .hword L(ipred_dc_top_tbl) -  40b
endfunc

// void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height, const int a,
//                               const int max_width, const int max_height);
// DC_LEFT prediction: average the left edge column, then fill the block.
// Dispatch is two-level: first branch (x5) to the height-specific summing
// code, which then branches (x3) to the width-specific store loop. Both
// targets come from the same table: w-entries first 5, h-entries last 5.
function ipred_dc_left_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1   // x2 = &topleft[-height] (16-bit pixels)
        clz             w3,  w3
        clz             w7,  w4
        adr             x5,  L(ipred_dc_left_tbl)
        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
        sub             w7,  w7,  #25
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w7,  [x5, w7, uxtw #1]
        sub             x3,  x5,  w3, uxtw      // x3 = width-specific store loop
        sub             x5,  x5,  w7, uxtw      // x5 = height-specific sum code
        add             x6,  x0,  x1            // x6 = second output row
        lsl             x1,  x1,  #1            // each store pair advances two rows
        br              x5

L(ipred_dc_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h          // sum of 4 pixels
        urshr           v0.4h,   v0.4h,   #2    // rounded average
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w4):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            L(ipred_dc_left_w4)
        ret

L(ipred_dc_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        br              x3
L(ipred_dc_left_w8):
        AARCH64_VALID_JUMP_TARGET
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            L(ipred_dc_left_w8)
        ret

L(ipred_dc_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addv            h0,      v0.8h
        urshr           v2.4h,   v0.4h,   #4
        dup             v0.8h,   v2.h[0]
        dup             v1.8h,   v2.h[0]
        br              x3
L(ipred_dc_left_w16):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        uaddlp          v0.4s,   v0.8h          // widen: 32-pixel sum can overflow 16 bits
        addv            s0,      v0.4s
        rshrn           v4.4h,   v0.4s,   #5
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w32):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v4.4h,   v0.4s,   #6
        dup             v0.8h,   v4.h[0]
        br              x3
L(ipred_dc_left_w64):
        AARCH64_VALID_JUMP_TARGET
        mov             v1.16b,  v0.16b
        mov             v2.16b,  v0.16b
        mov             v3.16b,  v0.16b
        sub             x1,  x1,  #64           // compensate for the post-incremented #64 stores
1:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            1b
        ret

L(ipred_dc_left_tbl):
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_h4)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w64)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w32)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w16)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w8)
        .hword L(ipred_dc_left_tbl) - L(ipred_dc_left_w4)
endfunc

// void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const topleft,
//                          const int width, const int height, const int a,
//                          const int max_width, const int max_height);
// DC prediction: average of the top row plus the left column.
// dc = (sum + (w+h)/2) >> ctz(w+h), then for non-square blocks a further
// exact divide by the remaining odd factor (3 or 5) via reciprocal
// multiply: (x * 0xAAAB) >> 17 == x/3, (x * 0x6667) >> 17 == x/5.
// Dispatch is two-level like dc_left: x5 -> height (left sum) code,
// which tail-branches via x3 to the width (top sum + store) code.
function ipred_dc_16bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw #1    // x2 = &topleft[-height] (16-bit pixels)
        add             w7,  w3,  w4             // width + height
        clz             w3,  w3
        clz             w6,  w4
        dup             v16.4s, w7               // width + height
        adr             x5,  L(ipred_dc_tbl)
        rbit            w7,  w7                  // rbit(width + height)
        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
        sub             w6,  w6,  #25
        clz             w7,  w7                  // ctz(width + height)
        ldrh            w3,  [x5, w3, uxtw #1]
        ldrh            w6,  [x5, w6, uxtw #1]
        neg             w7,  w7                  // -ctz(width + height)
        sub             x3,  x5,  w3, uxtw       // x3 = width-specific continuation
        sub             x5,  x5,  w6, uxtw       // x5 = height-specific entry
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1 = rounding bias
        dup             v17.4s,  w7              // -ctz(width + height), for ushl-right
        add             x6,  x0,  x1             // x6 = second output row
        lsl             x1,  x1,  #1             // each store pair advances two rows
        br              x5

L(ipred_dc_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,      v0.4h           // s0 = sum of the left column
        add             x2,  x2,  #2             // skip the top-left corner pixel
        br              x3
L(ipred_dc_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // left sum + rounding bias
        uaddlv          s1,      v1.4h           // sum of the top row
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s // sum >> ctz(width+height)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667             // reciprocal of 5 (h=16: w+h=20)
        mov             w17, #0xAAAB             // reciprocal of 3 (h=8: w+h=12)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.4h,   v0.h[0]
2:
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.4h},  [x0], x1
        st1             {v0.4h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,      v0.8h
        add             x2,  x2,  #2
        br              x3
L(ipred_dc_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s
        uaddlv          s1,      v1.8h
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v0.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667             // /5 (h=32: w+h=40)
        mov             w17, #0xAAAB             // /3 (h=4 or 16: w+h=12 or 24)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
2:
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h},  [x0], x1
        st1             {v0.8h},  [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h}, [x2], #32
        addp            v0.8h,   v0.8h,   v1.8h
        add             x2,  x2,  #2
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s
        addp            v1.8h,   v1.8h,   v2.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 4/8/32/64
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667             // /5 (h=4 or 64: w+h=20 or 80)
        mov             w17, #0xAAAB             // /3 (h=8 or 32: w+h=24 or 48)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h}, [x0], x1
        st1             {v0.8h, v1.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v0.8h,   v0.8h,   v2.8h
        add             x2,  x2,  #2
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s
        addp            v1.8h,   v1.8h,   v2.8h
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v1.8h,   v1.8h,   v3.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #32
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 8/16/64
        cmp             w4,  #8
        mov             w16, #0x6667             // /5 (h=8: w+h=40)
        mov             w17, #0xAAAB             // /3 (h=16 or 64: w+h=48 or 96)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_h64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h
        addp            v0.8h,   v0.8h,   v4.8h
        add             x2,  x2,  #2
        uaddlv          s0,      v0.8h
        br              x3
L(ipred_dc_w64):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
        add             v0.2s,   v0.2s,   v16.2s
        addp            v1.8h,   v1.8h,   v2.8h
        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
        addp            v3.8h,   v3.8h,   v4.8h
        addp            v20.8h,  v20.8h,  v21.8h
        addp            v22.8h,  v22.8h,  v23.8h
        addp            v1.8h,   v1.8h,   v3.8h
        addp            v20.8h,  v20.8h,  v22.8h
        addp            v1.8h,   v1.8h,   v20.8h
        uaddlv          s1,      v1.8h
        cmp             w4,  #64
        add             v0.2s,   v0.2s,   v1.2s
        ushl            v4.2s,   v0.2s,   v17.2s
        b.eq            1f
        // h = 16/32
        cmp             w4,  #16
        mov             w16, #0x6667             // /5 (h=16: w+h=80)
        mov             w17, #0xAAAB             // /3 (h=32: w+h=96)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v4.2s,   v4.2s,   v16.2s
        ushr            v4.2s,   v4.2s,   #17
1:
        sub             x1,  x1,  #64            // compensate for the post-incremented #64 stores
        dup             v0.8h,   v4.h[0]
        dup             v1.8h,   v4.h[0]
        dup             v2.8h,   v4.h[0]
        dup             v3.8h,   v4.h[0]
2:
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
        b.gt            2b
        ret

L(ipred_dc_tbl):
        .hword L(ipred_dc_tbl) - L(ipred_dc_h64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_h4)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w64)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w32)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w16)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w8)
        .hword L(ipred_dc_tbl) - L(ipred_dc_w4)
endfunc
810
811// void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
812//                             const pixel *const topleft,
813//                             const int width, const int height, const int a,
814//                             const int max_width, const int max_height);
// Paeth prediction: each output pixel is left, top or topleft — whichever
// is closest to (left + top - topleft).
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_paeth_16bpc_neon, export=1
        clz             w9,  w3                   // width is 4..64 -> clz in 25..29
        adr             x5,  L(ipred_paeth_tbl)
        sub             w9,  w9,  #25             // jump-table index, 0 (w=64) .. 4 (w=4)
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x2]            // v4 = topleft pixel, splatted
        add             x8,  x2,  #2              // x8 -> top row (topleft[1])
        sub             x2,  x2,  #8              // x2 -> left column, read 4 pixels backwards
        sub             x5,  x5,  w9, uxtw        // resolve relative table entry
        mov             x7,  #-8                  // left-pointer decrement per iteration
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer -> double stride
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v5.2d},  [x8]            // v5 = top[0..3], repeated in both halves
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
4:
        // Load 4 left pixels (stored below topleft, so in reverse row order),
        // each splatted across a 4-lane vector.
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7
        zip1            v0.2d,   v0.2d,   v1.2d   // v0 = left for rows 3|2
        zip1            v2.2d,   v2.2d,   v3.2d   // v2 = left for rows 1|0
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v2.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v23.8h,  v4.8h,   v17.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v2.8h,   v17.8h
        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
        umin            v19.8h,  v21.8h,  v23.8h
        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v23.8h,  v21.8h
        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v19.8h,  v17.8h
        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v21.d}[1], [x0], x1      // row 0
        st1             {v21.d}[0], [x6], x1      // row 1
        subs            w4,  w4,  #4
        st1             {v20.d}[1], [x0], x1      // row 2
        st1             {v20.d}[0], [x6], x1      // row 3
        b.gt            4b
        ret
80:
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v5.8h},  [x8], #16       // first 8 top pixels
        mov             w9,  w3                   // back up width
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride ...
        sub             x1,  x1,  w3, uxtw #1     // ... minus width in bytes (row-group step)
1:
        // Load the left pixels for the next 4 rows (reverse row order), splatted.
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
2:
        // Horizontal loop: 4 rows x 8 columns per iteration.
        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
        add             v16.8h,  v6.8h,   v0.8h   // base
        add             v17.8h,  v6.8h,   v1.8h
        add             v18.8h,  v6.8h,   v2.8h
        add             v19.8h,  v6.8h,   v3.8h
        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
        sabd            v21.8h,  v5.8h,   v17.8h
        sabd            v22.8h,  v5.8h,   v18.8h
        sabd            v23.8h,  v5.8h,   v19.8h
        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
        sabd            v25.8h,  v4.8h,   v17.8h
        sabd            v26.8h,  v4.8h,   v18.8h
        sabd            v27.8h,  v4.8h,   v19.8h
        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
        sabd            v17.8h,  v1.8h,   v17.8h
        sabd            v18.8h,  v2.8h,   v18.8h
        sabd            v19.8h,  v3.8h,   v19.8h
        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
        umin            v29.8h,  v21.8h,  v25.8h
        umin            v30.8h,  v22.8h,  v26.8h
        umin            v31.8h,  v23.8h,  v27.8h
        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
        cmge            v21.8h,  v25.8h,  v21.8h
        cmge            v22.8h,  v26.8h,  v22.8h
        cmge            v23.8h,  v27.8h,  v23.8h
        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
        cmge            v17.8h,  v29.8h,  v17.8h
        cmge            v18.8h,  v30.8h,  v18.8h
        cmge            v19.8h,  v31.8h,  v19.8h
        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
        bsl             v22.16b, v5.16b,  v4.16b
        bsl             v21.16b, v5.16b,  v4.16b
        bsl             v20.16b, v5.16b,  v4.16b
        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
        bit             v22.16b, v2.16b,  v18.16b
        bit             v21.16b, v1.16b,  v17.16b
        bit             v20.16b, v0.16b,  v16.16b
        st1             {v23.8h}, [x0], #16       // row 0
        st1             {v22.8h}, [x6], #16       // row 1
        subs            w3,  w3,  #8
        st1             {v21.8h}, [x5], #16       // row 2
        st1             {v20.8h}, [x10], #16      // row 3
        b.le            8f
        ld1             {v5.8h},  [x8], #16       // next 8 top pixels
        b               2b
8:
        subs            w4,  w4,  #4
        b.le            9f
        // End of horizontal loop, move pointers to next four rows
        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer to start of row
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        // Load the top row as early as possible
        ld1             {v5.8h},  [x8], #16
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_paeth_tbl):
        .hword L(ipred_paeth_tbl) - 640b
        .hword L(ipred_paeth_tbl) - 320b
        .hword L(ipred_paeth_tbl) - 160b
        .hword L(ipred_paeth_tbl) -  80b
        .hword L(ipred_paeth_tbl) -  40b
endfunc
944
945// void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
946//                              const pixel *const topleft,
947//                              const int width, const int height, const int a,
948//                              const int max_width, const int max_height);
// Smooth prediction: out = (256*(bottom+right) + w_hor*(left-right)
//                           + w_ver*(top-bottom) + 256) >> 9
// which is the weighted blend (w_ver*top + (256-w_ver)*bottom
//                              + w_hor*left + (256-w_hor)*right + 256) >> 9.
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_smooth_16bpc_neon, export=1
        movrel          x10, X(sm_weights)
        add             x11, x10, w4, uxtw        // x11 = weights_ver (offset by height)
        add             x10, x10, w3, uxtw        // x10 = weights_hor (offset by width)
        clz             w9,  w3                   // jump-table index from width (4..64)
        adr             x5,  L(ipred_smooth_tbl)
        sub             x12, x2,  w4, uxtw #1     // x12 -> last left pixel (bottom)
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x12] // bottom
        add             x8,  x2,  #2              // x8 -> top row
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x8]             // top
        ld1r            {v7.2s}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        dup             v5.8h,   v6.h[3]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v18.8h,  v18.8b
        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v0.8h,   v7.8h
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v18.4h
        smlal2          v23.4s,  v6.8h,   v18.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow to pixels
        rshrn           v21.4h,  v21.4s,  #9
        rshrn           v22.4h,  v22.4s,  #9
        rshrn           v23.4h,  v23.4s,  #9
        st1             {v20.4h}, [x0], x1
        st1             {v21.4h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.4h}, [x0], x1
        st1             {v23.4h}, [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x8]             // top
        ld1             {v7.8b}, [x10]            // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        dup             v5.8h,   v6.h[7]          // right
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
        uxtl            v7.8h,   v7.8b            // weights_hor
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
        uxtl            v18.8h,  v18.8b
        uxtl            v19.8h,  v19.8b
        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
        smlal           v22.4s,  v2.4h,   v7.4h
        smlal2          v23.4s,  v2.8h,   v7.8h
        smlal           v24.4s,  v1.4h,   v7.4h
        smlal2          v25.4s,  v1.8h,   v7.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v6.8h,   v16.8h
        smlal           v22.4s,  v6.4h,   v17.4h
        smlal2          v23.4s,  v6.8h,   v17.8h
        smlal           v24.4s,  v6.4h,   v18.4h
        smlal2          v25.4s,  v6.8h,   v18.8h
        smlal           v26.4s,  v6.4h,   v19.4h
        smlal2          v27.4s,  v6.8h,   v19.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow to pixels
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        add             x12, x2,  w3, uxtw #1     // x12 -> pixel right of the top row
        sub             x1,  x1,  w3, uxtw #1     // 2*stride minus width (row-group step)
        ld1r            {v5.8h}, [x12]            // right
        sub             x2,  x2,  #4              // x2 -> left column, 2 pixels backwards/iter
        mov             x7,  #-4
        mov             w9,  w3                   // back up width
        add             v31.4h,  v4.4h,   v5.4h   // bottom+right

1:
        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        uxtl            v16.8h,  v16.8b           // weights_ver
        uxtl            v17.8h,  v17.8b
2:
        // Horizontal loop: 2 rows x 16 columns per iteration.
        ld1             {v7.16b}, [x10],  #16     // weights_hor
        ld1             {v2.8h, v3.8h}, [x8], #32 // top
        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
        ushll           v21.4s,  v31.4h,  #8
        ushll           v22.4s,  v31.4h,  #8
        ushll           v23.4s,  v31.4h,  #8
        ushll           v24.4s,  v31.4h,  #8
        ushll           v25.4s,  v31.4h,  #8
        ushll           v26.4s,  v31.4h,  #8
        ushll           v27.4s,  v31.4h,  #8
        uxtl            v6.8h,   v7.8b            // weights_hor
        uxtl2           v7.8h,   v7.16b
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
        smlal           v22.4s,  v1.4h,   v7.4h
        smlal2          v23.4s,  v1.8h,   v7.8h
        smlal           v24.4s,  v0.4h,   v6.4h
        smlal2          v25.4s,  v0.8h,   v6.8h
        smlal           v26.4s,  v0.4h,   v7.4h
        smlal2          v27.4s,  v0.8h,   v7.8h
        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
        smlal2          v21.4s,  v2.8h,   v16.8h
        smlal           v22.4s,  v3.4h,   v16.4h
        smlal2          v23.4s,  v3.8h,   v16.8h
        smlal           v24.4s,  v2.4h,   v17.4h
        smlal2          v25.4s,  v2.8h,   v17.8h
        smlal           v26.4s,  v3.4h,   v17.4h
        smlal2          v27.4s,  v3.8h,   v17.8h
        rshrn           v20.4h,  v20.4s,  #9      // round and narrow to pixels
        rshrn2          v20.8h,  v21.4s,  #9
        rshrn           v21.4h,  v22.4s,  #9
        rshrn2          v21.8h,  v23.4s,  #9
        rshrn           v22.4h,  v24.4s,  #9
        rshrn2          v22.8h,  v25.4s,  #9
        rshrn           v23.4h,  v26.4s,  #9
        rshrn2          v23.8h,  v27.4s,  #9
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer
        sub             x10, x10, w9, uxtw        // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_tbl):
        .hword L(ipred_smooth_tbl) - 640b
        .hword L(ipred_smooth_tbl) - 320b
        .hword L(ipred_smooth_tbl) - 160b
        .hword L(ipred_smooth_tbl) -  80b
        .hword L(ipred_smooth_tbl) -  40b
endfunc
1148
1149// void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1150//                                const pixel *const topleft,
1151//                                const int width, const int height, const int a,
1152//                                const int max_width, const int max_height);
// Smooth-vertical prediction: blend top row towards the bottom-left pixel,
// out = bottom + (((top - bottom) * w_ver + 128) >> 8).
// The weights are loaded <<7 so sqrdmulh ((2*a*b + 0x8000) >> 16) computes
// the rounded >>8 multiply directly.
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_smooth_v_16bpc_neon, export=1
        movrel          x7,  X(sm_weights)
        add             x7,  x7,  w4, uxtw        // x7 = weights_ver (offset by height)
        clz             w9,  w3                   // jump-table index from width (4..64)
        adr             x5,  L(ipred_smooth_v_tbl)
        sub             x8,  x2,  w4, uxtw #1     // x8 -> last left pixel (bottom)
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v4.8h},  [x8] // bottom
        add             x2,  x2,  #2              // x2 -> top row
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v6.2d}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
4:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
        zip1            v18.2s,  v18.2s,  v19.2s
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v18.8h,  v18.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v18.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v6.8h}, [x2]             // top
        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
8:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v6.8h,   v17.8h
        sqrdmulh        v22.8h,  v6.8h,   v18.8h
        sqrdmulh        v23.8h,  v6.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        // Set up pointers for four rows in parallel; x0, x6, x5, x8
        add             x5,  x0,  x1
        add             x8,  x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride ...
        sub             x1,  x1,  w3, uxtw #1     // ... minus width in bytes (row-group step)
        mov             w9,  w3                   // back up width

1:
        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
        ushll           v17.8h,  v17.8b,  #7
        ushll           v18.8h,  v18.8b,  #7
        ushll           v19.8h,  v19.8b,  #7
2:
        // Horizontal loop: 4 rows x 16 columns per iteration.
        ld1             {v2.8h, v3.8h}, [x2], #32 // top
        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
        sub             v3.8h,   v3.8h,   v4.8h
        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v16.8h
        sqrdmulh        v22.8h,  v2.8h,   v17.8h
        sqrdmulh        v23.8h,  v3.8h,   v17.8h
        sqrdmulh        v24.8h,  v2.8h,   v18.8h
        sqrdmulh        v25.8h,  v3.8h,   v18.8h
        sqrdmulh        v26.8h,  v2.8h,   v19.8h
        sqrdmulh        v27.8h,  v3.8h,   v19.8h
        add             v20.8h,  v20.8h,  v4.8h   // + bottom
        add             v21.8h,  v21.8h,  v4.8h
        add             v22.8h,  v22.8h,  v4.8h
        add             v23.8h,  v23.8h,  v4.8h
        add             v24.8h,  v24.8h,  v4.8h
        add             v25.8h,  v25.8h,  v4.8h
        add             v26.8h,  v26.8h,  v4.8h
        add             v27.8h,  v27.8h,  v4.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        st1             {v22.8h, v23.8h}, [x6], #32
        st1             {v24.8h, v25.8h}, [x5], #32
        st1             {v26.8h, v27.8h}, [x8], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x2,  x2,  w9, uxtw #1     // rewind top pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x8,  x8,  x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_v_tbl):
        .hword L(ipred_smooth_v_tbl) - 640b
        .hword L(ipred_smooth_v_tbl) - 320b
        .hword L(ipred_smooth_v_tbl) - 160b
        .hword L(ipred_smooth_v_tbl) -  80b
        .hword L(ipred_smooth_v_tbl) -  40b
endfunc
1275
1276// void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
1277//                                const pixel *const topleft,
1278//                                const int width, const int height, const int a,
1279//                                const int max_width, const int max_height);
// Smooth-horizontal prediction: blend left column towards the top-right pixel,
// out = right + (((left - right) * w_hor + 128) >> 8).
// The weights are loaded <<7 so sqrdmulh ((2*a*b + 0x8000) >> 16) computes
// the rounded >>8 multiply directly.
// x0=dst, x1=stride, x2=topleft, w3=width, w4=height
function ipred_smooth_h_16bpc_neon, export=1
        movrel          x8,  X(sm_weights)
        add             x8,  x8,  w3, uxtw        // x8 = weights_hor (offset by width)
        clz             w9,  w3                   // jump-table index from width (4..64)
        adr             x5,  L(ipred_smooth_h_tbl)
        add             x12, x2,  w3, uxtw #1     // x12 -> pixel right of the top row
        sub             w9,  w9,  #25
        ldrh            w9,  [x5, w9, uxtw #1]
        ld1r            {v5.8h},  [x12] // right
        sub             x5,  x5,  w9, uxtw
        add             x6,  x0,  x1              // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1              // two rows per pointer
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v7.2s}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
4:
        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
        zip1            v0.2d,   v3.2d,   v2.2d
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v1.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        st1             {v20.d}[0], [x0], x1
        st1             {v20.d}[1], [x6], x1
        subs            w4,  w4,  #4
        st1             {v21.d}[0], [x0], x1
        st1             {v21.d}[1], [x6], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v7.8b}, [x8]             // weights_hor
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
8:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
        sub             v3.8h,   v3.8h,   v5.8h   // left-right
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v0.8h,   v0.8h,   v5.8h
        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v1.8h,   v7.8h
        sqrdmulh        v23.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        st1             {v20.8h}, [x0], x1
        st1             {v21.8h}, [x6], x1
        subs            w4,  w4,  #4
        st1             {v22.8h}, [x0], x1
        st1             {v23.8h}, [x6], x1
        b.gt            8b
        ret
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        sub             x2,  x2,  #8              // x2 -> left column, read backwards
        mov             x7,  #-8
        // Set up pointers for four rows in parallel; x0, x6, x5, x10
        add             x5,  x0,  x1
        add             x10, x6,  x1
        lsl             x1,  x1,  #1              // x1 = 4*stride ...
        sub             x1,  x1,  w3, uxtw #1     // ... minus width in bytes (row-group step)
        mov             w9,  w3                   // back up width

1:
        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
        sub             v0.8h,   v0.8h,   v5.8h   // left-right
        sub             v1.8h,   v1.8h,   v5.8h
        sub             v2.8h,   v2.8h,   v5.8h
        sub             v3.8h,   v3.8h,   v5.8h
2:
        // Horizontal loop: 4 rows x 16 columns per iteration.
        ld1             {v7.16b}, [x8],   #16     // weights_hor
        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
        ushll2          v7.8h,   v7.16b,  #7
        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
        sqrdmulh        v22.8h,  v2.8h,   v6.8h
        sqrdmulh        v23.8h,  v2.8h,   v7.8h
        sqrdmulh        v24.8h,  v1.8h,   v6.8h
        sqrdmulh        v25.8h,  v1.8h,   v7.8h
        sqrdmulh        v26.8h,  v0.8h,   v6.8h
        sqrdmulh        v27.8h,  v0.8h,   v7.8h
        add             v20.8h,  v20.8h,  v5.8h   // + right
        add             v21.8h,  v21.8h,  v5.8h
        add             v22.8h,  v22.8h,  v5.8h
        add             v23.8h,  v23.8h,  v5.8h
        add             v24.8h,  v24.8h,  v5.8h
        add             v25.8h,  v25.8h,  v5.8h
        add             v26.8h,  v26.8h,  v5.8h
        add             v27.8h,  v27.8h,  v5.8h
        subs            w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0],  #32
        st1             {v22.8h, v23.8h}, [x6],  #32
        st1             {v24.8h, v25.8h}, [x5],  #32
        st1             {v26.8h, v27.8h}, [x10], #32
        b.gt            2b
        subs            w4,  w4,  #4
        b.le            9f
        sub             x8,  x8,  w9, uxtw        // rewind weights_hor pointer
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        add             x5,  x5,  x1
        add             x10, x10, x1
        mov             w3,  w9                   // restore width counter
        b               1b
9:
        ret

L(ipred_smooth_h_tbl):
        .hword L(ipred_smooth_h_tbl) - 640b
        .hword L(ipred_smooth_h_tbl) - 320b
        .hword L(ipred_smooth_h_tbl) - 160b
        .hword L(ipred_smooth_h_tbl) -  80b
        .hword L(ipred_smooth_h_tbl) -  40b
endfunc
1407
// Sliding padding mask: 48 zero bytes followed by 48 0xff bytes.
// Loading 32 bytes starting 2*n bytes before the `padding_mask` label yields
// a mask whose first n halfwords are 0x0000 and the rest 0xffff; the edge
// functions below use it with bit/bsl to replace out-of-range input pixels
// with a replicated padding pixel.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
1423
1424// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
1425//                                        const pixel *const in, const int end,
1426//                                        const int bitdepth_max);
// Upsample the edge by 2x with a 4-tap filter:
//   out[2*i]   = in[i+1]
//   out[2*i+1] = clip((9*(in[i+1] + in[i+2]) - (in[i] + in[i+3]) + 8) >> 4,
//                     0, bitdepth_max)
// Input pixels at index >= end are replaced by in[end] via the sliding
// padding mask. Processes 16 input pixels, producing 32 output pixels.
// x0=out, w1=hsz, x2=in, w3=end, w4=bitdepth_max
function ipred_z1_upsample_edge_16bpc_neon, export=1
        dup             v30.8h,  w4               // bitdepth_max
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h},  [x2]     // in[]
        add             x5,  x2,  w3,  uxtw #1    // in[end]
        sub             x4,  x4,  w3,  uxtw #1    // mask: 0x0000 for i < end, 0xffff after

        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v3.8h, v4.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v3.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v4.16b

        ext             v4.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
        ext             v5.16b,  v1.16b,  v2.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #4  // in[i+2]
        ext             v7.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6  // in[i+3]
        ext             v17.16b, v1.16b,  v2.16b,  #6

        add             v18.8h,  v4.8h,   v6.8h   // in[i+1] + in[i+2]
        add             v19.8h,  v5.8h,   v7.8h
        add             v20.8h,  v0.8h,   v16.8h  // in[i+0] + in[i+3]
        add             v21.8h,  v1.8h,   v17.8h
        umull           v22.4s,  v18.4h,  v31.4h  // 9*(in[i+1] + in[i+2])
        umull2          v23.4s,  v18.8h,  v31.8h
        umull           v24.4s,  v19.4h,  v31.4h
        umull2          v25.4s,  v19.8h,  v31.8h
        usubw           v22.4s,  v22.4s,  v20.4h  // - (in[i+0] + in[i+3])
        usubw2          v23.4s,  v23.4s,  v20.8h
        usubw           v24.4s,  v24.4s,  v21.4h
        usubw2          v25.4s,  v25.4s,  v21.8h

        // Rounded >>4 with unsigned saturation (clamps negatives to 0)
        sqrshrun        v16.4h,  v22.4s,  #4
        sqrshrun2       v16.8h,  v23.4s,  #4
        sqrshrun        v17.4h,  v24.4s,  #4
        sqrshrun2       v17.8h,  v25.4s,  #4

        smin            v16.8h,  v16.8h,  v30.8h  // clamp to bitdepth_max
        smin            v17.8h,  v17.8h,  v30.8h

        // Interleave the original in[i+1] pixels with the filtered values
        zip1            v0.8h,   v4.8h,   v16.8h
        zip2            v1.8h,   v4.8h,   v16.8h
        zip1            v2.8h,   v5.8h,   v17.8h
        zip2            v3.8h,   v5.8h,   v17.8h

        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]

        ret
endfunc
1479
// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
//                                        const pixel *const in,
//                                        const int bitdepth_max);
// Like ipred_z1_upsample_edge, but with the interpolated pixel placed
// before each original pixel, and in[-1] treated as equal to in[0].
function ipred_z2_upsample_edge_16bpc_neon, export=1
        dup             v30.8h,  w3               // bitdepth_max
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h}, [x2]      // in[]
        add             x5,  x2,  w1,  uxtw #1    // in[sz]
        sub             x4,  x4,  w1,  uxtw #1    // padding_mask - 2*sz

        ld1r            {v3.8h},  [x2]            // in[0] for padding
        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v4.8h, v5.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v4.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v5.16b

        // v4 = in[i-1] (with in[-1] == in[0]), v5 = in[i+1], v6 = in[i+2]
        ext             v4.16b,  v3.16b,  v0.16b,  #14
        ext             v5.16b,  v0.16b,  v1.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #4

        add             v16.8h,  v0.8h,   v5.8h   // in[i+0] + in[i+1]
        add             v17.8h,  v4.8h,   v6.8h   // in[i-1] + in[i+2]
        umull           v18.4s,  v16.4h,  v31.4h  // 9*(in[i+0] + in[i+1])
        umull2          v19.4s,  v16.8h,  v31.8h
        usubw           v18.4s,  v18.4s,  v17.4h  // - (in[i-1] + in[i+2])
        usubw2          v19.4s,  v19.4s,  v17.8h

        sqrshrun        v16.4h,  v18.4s,  #4      // (x + 8) >> 4, clamped at 0
        sqrshrun2       v16.8h,  v19.4s,  #4

        add             x5,  x0,  #2*16

        smin            v16.8h,  v16.8h,  v30.8h  // clamp above at bitdepth_max

        // Interleave interpolated and original pixels.
        zip1            v4.8h,   v0.8h,   v16.8h
        zip2            v5.8h,   v0.8h,   v16.8h

        // In case sz=8, output one single pixel in out[16].
        st1             {v2.h}[0], [x5]
        st1             {v4.8h, v5.8h}, [x0]

        ret
endfunc
1527
// Symmetric 3-tap smoothing kernels (center coefficient and its neighbour;
// the filter in ipred_z1_filter_edge applies them as k0,k1,k0 with a >>4),
// indexed by strength-1. Strength 3 uses a 5-tap path with hardcoded
// coefficients instead.
const edge_filter
        .short 0, 4, 8, 0
        .short 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .short 2, 4, 4, 0
endconst
1534
// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
//                                      const pixel *const in, const int end,
//                                      const int strength);
// Smooths an intra edge with a symmetric 3-tap (strength 1-2) or 5-tap
// (strength 3) kernel, with rounding (>> 4). Pixels past in[end] are
// treated as equal to the last valid pixel.
function ipred_z1_filter_edge_16bpc_neon, export=1
        cmp             w4, #3
        b.eq            L(fivetap)                // if (strength == 3) goto fivetap

        // Load the two nonzero kernel taps for strength 1 or 2.
        movrel          x5,  edge_filter, -6
        add             x5,  x5,  w4,  uxtw #3    // edge_filter + 2*((strength - 1)*4 + 1)

        ld1             {v31.s}[0], [x5]          // kernel[1-2]

        ld1             {v0.8h}, [x2], #16

        dup             v30.8h, v31.h[0]          // outer tap
        dup             v31.8h, v31.h[1]          // center tap
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f
        // v3/v4 = in[i+1], v5/v6 = in[i+2]
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        // out = (k0*in[i] + k1*in[i+1] + k0*in[i+2] + 8) >> 4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        sub             w3,  w3,  #16
        st1             {v16.8h, v17.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #24
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h}, [x5] // padding_mask

        ld1r            {v2.8h}, [x6]
        bit             v0.16b,  v2.16b,  v3.16b  // Pad v0-v1
        bit             v1.16b,  v2.16b,  v4.16b

        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        st1             {v16.8h, v17.8h}, [x0], #32
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        // Strength 3: 5-tap kernel (2, 4, 4, 4, 2) >> 4, with in[-1] == in[0].
        sub             x2,  x2,  #2              // topleft -= 1 pixel
        movi            v29.8h, #2
        ld1             {v0.8h}, [x2], #16
        movi            v30.8h, #4
        movi            v31.8h, #4
        ins             v0.h[0], v0.h[1]          // replicate in[0] into in[-1]
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f                        // if (end + 1 < 19)
        // v3/v4 = in[i], v5/v6 = in[i+1], v16/v17 = in[i+2], v18/v19 = in[i+3]
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8
        ext             v19.16b, v1.16b,  v2.16b,  #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask, -2
        sub             w6,  w3,  #23
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask

        ld1r            {v28.8h}, [x6]
        bit             v0.16b,  v28.16b, v3.16b  // Pad v0-v2
        bit             v1.16b,  v28.16b, v4.16b
        bit             v2.16b,  v28.16b, v5.16b
4:
        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8
        ext             v19.16b, v1.16b,  v2.16b,  #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v28.16b          // refill with padding
        mov             v2.16b,  v28.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #8
        st1             {v28.8h}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
1713
// void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
//                                 const int n);
// Fills out[0..n) with the pixel value px, 8 pixels per iteration.
function ipred_pixel_set_16bpc_neon, export=1
        dup             v0.8h,   w1               // broadcast px to all lanes
1:
        st1             {v0.8h}, [x0], #16
        subs            w2,  w2,  #8              // n -= 8
        b.gt            1b
        ret
endfunc
1724
// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const int width, const int height,
//                                const int dx, const int max_base_x);
// Z1 (vertical-ish) directional prediction: for each row, interpolate
// between top[base] and top[base+1] with a 6-bit fraction, advancing
// xpos by dx per row. Rows with base >= max_base_x are filled with the
// replicated padding pixel top[max_base_x]. Two rows per iteration.
function ipred_z1_fill1_16bpc_neon, export=1
        clz             w9,  w3
        adr             x8,  L(ipred_z1_fill1_tbl)
        sub             w9,  w9,  #25             // table index from log2(width)
        ldrh            w9,  [x8, w9, uxtw #1]
        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
        sub             x8,  x8,  w9,  uxtw
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        br              x8
40:
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        lsl             w8,  w8,  #1              // byte offsets (2 bytes/pixel)
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #2
        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h},  [x8]            // top[base]
        ld1             {v2.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        ldr             h1, [x8, #16]             // 9th pixel, for the ext below
        ldr             h3, [x10, #16]
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        ext             v1.16b,  v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v3.16b,  #2
        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
        umlal           v16.4s,  v1.4h,   v4.4h   // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v1.8h,   v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v3.4h,   v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v3.8h,   v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                   // remember width

        add             x13, x0,  x1              // second output row
        lsl             x1,  x1,  #1              // stride *= 2 (two rows/iter)
        sub             x1,  x1,  w3,  uxtw #1    // compensate for the #32 post-increments
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v6.8h,   w9               // frac
        dup             v7.8h,   w11
        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // top[base]
        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v16.8h,  w9               // 64 - frac
        dup             v17.8h,  w11
        add             w7,  w7,  w5              // xpos += dx
2:
        // Inner loop over the width, 16 pixels per iteration for each
        // of the two rows.
        ext             v18.16b, v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v19.16b, v1.16b,  v2.16b,  #2
        ext             v20.16b, v3.16b,  v4.16b,  #2
        ext             v21.16b, v4.16b,  v5.16b,  #2
        subs            w3,  w3,  #16
        umull           v22.4s,  v0.4h,   v16.4h  // top[base]*(64-frac)
        umlal           v22.4s,  v18.4h,  v6.4h   // + top[base+1]*frac
        umull2          v23.4s,  v0.8h,   v16.8h
        umlal2          v23.4s,  v18.8h,  v6.8h
        umull           v24.4s,  v1.4h,   v16.4h
        umlal           v24.4s,  v19.4h,  v6.4h
        umull2          v25.4s,  v1.8h,   v16.8h
        umlal2          v25.4s,  v19.8h,  v6.8h
        umull           v26.4s,  v3.4h,   v17.4h
        umlal           v26.4s,  v20.4h,  v7.4h
        umull2          v27.4s,  v3.8h,   v17.8h
        umlal2          v27.4s,  v20.8h,  v7.8h
        umull           v28.4s,  v4.4h,   v17.4h
        umlal           v28.4s,  v21.4h,  v7.4h
        umull2          v29.4s,  v4.8h,   v17.8h
        umlal2          v29.4s,  v21.8h,  v7.8h
        rshrn           v22.4h,  v22.4s,  #6
        rshrn2          v22.8h,  v23.4s,  #6
        rshrn           v23.4h,  v24.4s,  #6
        rshrn2          v23.8h,  v25.4s,  #6
        rshrn           v24.4h,  v26.4s,  #6
        rshrn2          v24.8h,  v27.4s,  #6
        rshrn           v25.4h,  v28.4s,  #6
        rshrn2          v25.8h,  v29.4s,  #6
        st1             {v22.8h, v23.8h}, [x0],  #32
        st1             {v24.8h, v25.8h}, [x13], #32
        b.le            3f
        mov             v0.16b,  v2.16b
        ld1             {v1.8h, v2.8h}, [x8],  #32 // top[base]
        mov             v3.16b,  v5.16b
        ld1             {v4.8h, v5.8h}, [x10], #32
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12                  // restore width
        b               1b
9:
        ret

169:
        // Out of range; fill both rows with the padding pixel.
        st1             {v31.8h}, [x0],  #16
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b

L(ipred_z1_fill1_tbl):
        .hword L(ipred_z1_fill1_tbl) - 640b
        .hword L(ipred_z1_fill1_tbl) - 320b
        .hword L(ipred_z1_fill1_tbl) - 160b
        .hword L(ipred_z1_fill1_tbl) -  80b
        .hword L(ipred_z1_fill1_tbl) -  40b
endfunc
1927
// void ipred_z1_fill2_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const int width, const int height,
//                                const int dx, const int max_base_x);
// Variant of ipred_z1_fill1 for upsampled edges: top[] holds interleaved
// even/odd pixels, so top[base] and top[base+1] are deinterleaved with
// uzp1/uzp2 instead of a shifted copy. Only width 4 and 8 occur here.
function ipred_z1_fill2_16bpc_neon, export=1
        cmp             w3,  #8
        // Pixels are 16 bit wide: the byte offset of top[max_base_x] is
        // 2*max_base_x, and the padding pixel must be replicated as a
        // 16 bit element (not as a byte as in the 8 bpc version).
        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        lsl             w8,  w8,  #1              // byte offsets
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8h,   v0.8h,   v0.8h   // top[base+1]
        uzp1            v0.8h,   v0.8h,   v0.8h   // top[base]
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h},  [x8]     // top[base]
        ld1             {v2.8h, v3.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // top[base+1]
        uzp1            v0.8h,   v0.8h,   v1.8h   // top[base]
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:
        // Out of range; fill the remaining rows with the padding pixel.
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret
endfunc
2024
// void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
//                               const int n);
// Copies n pixels from src to dst in reversed order, 8 at a time.
function ipred_reverse_16bpc_neon, export=1
        sub             x1,  x1,  #16             // start at the last 8 pixels
        add             x3,  x0,  #8              // second write pointer, +4 pixels
        mov             x4,  #16                  // both pointers step 16 bytes
1:
        ld1             {v0.8h}, [x1]
        rev64           v0.8h,  v0.8h             // reverse within each 64-bit half
        sub             x1,  x1,  #16             // walk src backwards
        subs            w2,  w2,  #8
        // Storing the high half before the low half completes the
        // full 128-bit reversal.
        st1             {v0.d}[1], [x0], x4
        st1             {v0.d}[0], [x3], x4
        b.gt            1b
        ret
endfunc
2041
// Per-lane column offsets 0..7, used by ipred_z2_fill* below.
const increments
        .short          0,  1,  2,  3,  4,  5,  6,  7
endconst
2045
2046// void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
2047//                                const pixel *const top,
2048//                                const pixel *const left,
2049//                                const int width, const int height,
2050//                                const int dx, const int dy);
2051function ipred_z2_fill1_16bpc_neon, export=1
2052        clz             w10, w4
2053        adr             x9,  L(ipred_z2_fill1_tbl)
2054        sub             w10, w10, #25
2055        ldrh            w10, [x9, w10, uxtw #1]
2056        mov             w8,  #(1 << 6)            // xpos = 1 << 6
2057        sub             x9,  x9,  w10, uxtw
2058        sub             w8,  w8,  w6              // xpos -= dx
2059
2060        movrel          x11, increments
2061        ld1             {v31.8h},  [x11]          // increments
2062        neg             w7,  w7                   // -dy
2063
2064        br              x9
206540:
2066        AARCH64_VALID_JUMP_TARGET
2067
2068        dup             v30.4h,  w7               // -dy
2069        movi            v17.8b,  #1
2070
2071        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2072        movi            v25.8h,  #0x3e
2073        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2074
2075        // Worst case height for w=4 is 16, but we need at least h+1 elements
2076        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
2077
2078        movi            v26.8h,  #64
2079        movi            v19.16b, #4
2080
2081        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2082        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2083
2084        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2085
2086        movi            v23.4h,  #1, lsl #8
2087        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2088        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2089        movi            v17.8b,  #2
2090        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2091
2092        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2093        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2094
2095        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2096
2097        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2098
2099        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2100
2101        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2102
2103        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2104        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2105
2106        movi            v29.16b, #4
21074:
2108        asr             w9,  w8,  #6              // base_x
2109        dup             v16.4h,  w8               // xpos
2110        sub             w8,  w8,  w6              // xpos -= dx
2111        cmp             w9,  #-4                  // base_x <= -4
2112        asr             w11, w8,  #6              // base_x
2113        b.le            49f
2114
2115        lsl             w9,  w9,  #1
2116        lsl             w11, w11, #1
2117
2118        dup             v17.4h,  w8               // xpos
2119
2120        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2121        ldr             q6,  [x2, w11, sxtw]
2122
2123        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2124
2125        // Cut corners here; only doing tbl over v0-v1 here; we only
2126        // seem to need the last pixel, from v2, after skipping to the
2127        // left-only codepath below.
2128        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2129
2130        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2131
2132        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
2133        ext             v7.16b,  v6.16b,  v6.16b,  #2
2134
2135        and             v16.16b, v16.16b, v25.16b // frac_x
2136
2137        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2138
2139        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
2140        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
2141
2142        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2143
2144        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2145
2146        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2147        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2148        umull2          v22.4s,  v18.8h,  v28.8h
2149        umlal2          v22.4s,  v19.8h,  v27.8h
2150
2151        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
2152        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2153        umull2          v24.4s,  v4.8h,   v17.8h
2154        umlal2          v24.4s,  v5.8h,   v16.8h
2155
2156        cmge            v20.8h,  v20.8h,  #0
2157
2158        rshrn           v21.4h,  v21.4s,  #6
2159        rshrn2          v21.8h,  v22.4s,  #6
2160        rshrn           v22.4h,  v23.4s,  #6
2161        rshrn2          v22.8h,  v24.4s,  #6
2162
2163        bit             v21.16b, v22.16b, v20.16b
2164
2165        st1             {v21.d}[0], [x0], x1
2166        sub             w8,  w8,  w6              // xpos -= dx
2167        subs            w5,  w5,  #2
2168        st1             {v21.d}[1], [x0], x1
2169        b.le            9f
2170
2171        ext             v18.16b, v19.16b, v19.16b, #8
2172        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2173        b               4b
2174
217549:
2176        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2]
2177
2178        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2179
2180        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2181        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2182        umull2          v21.4s,  v18.8h,  v28.8h
2183        umlal2          v21.4s,  v19.8h,  v27.8h
2184
2185        rshrn           v20.4h,  v20.4s,  #6
2186        rshrn2          v20.8h,  v21.4s,  #6
2187
2188        st1             {v20.d}[0], [x0], x1
2189        subs            w5,  w5,  #2
2190        st1             {v20.d}[1], [x0], x1
2191        b.le            9f
2192
2193        ext             v18.16b, v19.16b, v19.16b, #8
2194        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2195        b               49b
2196
21979:
2198        ret
2199
220080:
2201        AARCH64_VALID_JUMP_TARGET
2202
2203        stp             d8,  d9,  [sp, #-0x40]!
2204        stp             d10, d11, [sp, #0x10]
2205        stp             d12, d13, [sp, #0x20]
2206        stp             d14, d15, [sp, #0x30]
2207
2208        dup             v18.8h,  w7               // -dy
2209        add             x3,  x3,  #2              // Skip past left[0]
2210
2211        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2212        movi            v25.8h,  #0x3e
2213        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2214
2215        // Worst case height for w=8 is 32.
2216        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2217        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2218
2219        movi            v26.8h,  #64
2220        movi            v19.16b, #4
2221
2222        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2223        and             v27.16b, v16.16b, v25.16b // frac_y
2224
2225        movi            v23.8h,  #1, lsl #8
2226        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2227        mov             v18.16b, v15.16b          // left[0]
2228        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2229        movi            v17.16b, #2
2230        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
2231
2232        // Cut corners here; for the first row we don't expect to need to
2233        // read outside of v0.
2234        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
2235
2236        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
2237        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
2238
2239        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
2240
2241        movi            v24.16b, #4
22428:
2243        asr             w9,  w8,  #6              // base_x
2244        dup             v16.8h,   w8              // xpos
2245        sub             w8,  w8,  w6              // xpos -= dx
2246        cmp             w9,  #-16                 // base_x <= -16
2247        asr             w11, w8,  #6              // base_x
2248        b.le            89f
2249
2250        dup             v17.8h,   w8              // xpos
2251
2252        add             x9,  x2,  w9,  sxtw #1
2253        add             x11, x2,  w11, sxtw #1
2254
2255        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
2256        mov             v19.16b, v15.16b          // left[0]
2257        ld1             {v6.8h, v7.8h}, [x11]
2258
2259        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2260
2261        mov             v20.16b, v15.16b          // left[0]
2262
2263        sshr            v21.8h,  v16.8h,  #6      // first base_x
2264        sshr            v22.8h,  v17.8h,  #6
2265
2266        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2267
2268        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
2269        ext             v7.16b,  v6.16b,  v7.16b,  #2
2270
2271        and             v16.16b, v16.16b, v25.16b // frac_x
2272        and             v17.16b, v17.16b, v25.16b
2273
2274        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2275        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2276
2277        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
2278        sub             v9.8h,   v26.8h,  v17.8h
2279
2280        umull2          v11.4s,  v18.8h,  v28.8h
2281        umlal2          v11.4s,  v19.8h,  v27.8h
2282
2283        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2284        add             v22.8h,  v22.8h,  v31.8h
2285
2286        umull           v12.4s,  v19.4h,  v28.4h
2287        umlal           v12.4s,  v20.4h,  v27.4h
2288        umull2          v13.4s,  v19.8h,  v28.8h
2289        umlal2          v13.4s,  v20.8h,  v27.8h
2290
2291        rshrn           v10.4h,  v10.4s,  #6
2292        rshrn2          v10.8h,  v11.4s,  #6
2293        rshrn           v11.4h,  v12.4s,  #6
2294        rshrn2          v11.8h,  v13.4s,  #6
2295
2296        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2297        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2298        umull2          v13.4s,  v4.8h,   v8.8h
2299        umlal2          v13.4s,  v5.8h,   v16.8h
2300        umull           v14.4s,  v6.4h,   v9.4h
2301        umlal           v14.4s,  v7.4h,   v17.4h
2302        umull2          v18.4s,  v6.8h,   v9.8h
2303        umlal2          v18.4s,  v7.8h,   v17.8h
2304
2305        cmge            v21.8h,  v21.8h,  #0
2306        cmge            v22.8h,  v22.8h,  #0
2307
2308        rshrn           v12.4h,  v12.4s,  #6
2309        rshrn2          v12.8h,  v13.4s,  #6
2310        rshrn           v13.4h,  v14.4s,  #6
2311        rshrn2          v13.8h,  v18.4s,  #6
2312
2313        bit             v10.16b, v12.16b, v21.16b
2314        bit             v11.16b, v13.16b, v22.16b
2315
2316        st1             {v10.8h}, [x0], x1
2317        subs            w5,  w5,  #2
2318        sub             w8,  w8,  w6              // xpos -= dx
2319        st1             {v11.8h}, [x0], x1
2320        b.le            9f
2321
2322        mov             v18.16b, v20.16b
2323        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2324        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2325        b               8b
2326
232789:
2328        mov             v19.16b, v15.16b
2329        mov             v20.16b, v15.16b
2330        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2331        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
2332
2333        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2334        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2335        umull2          v5.4s,   v18.8h,  v28.8h
2336        umlal2          v5.4s,   v19.8h,  v27.8h
2337        umull           v6.4s,   v19.4h,  v28.4h
2338        umlal           v6.4s,   v20.4h,  v27.4h
2339        umull2          v7.4s,   v19.8h,  v28.8h
2340        umlal2          v7.4s,   v20.8h,  v27.8h
2341
2342        rshrn           v4.4h,   v4.4s,   #6
2343        rshrn2          v4.8h,   v5.4s,   #6
2344        rshrn           v5.4h,   v6.4s,   #6
2345        rshrn2          v5.8h,   v7.4s,   #6
2346
2347        st1             {v4.8h}, [x0], x1
2348        subs            w5,  w5,  #2
2349        st1             {v5.8h}, [x0], x1
2350        b.le            9f
2351
2352        mov             v18.16b, v20.16b
2353        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
2354        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
2355        b               89b
2356
23579:
2358        ldp             d14, d15, [sp, #0x30]
2359        ldp             d12, d13, [sp, #0x20]
2360        ldp             d10, d11, [sp, #0x10]
2361        ldp             d8,  d9,  [sp], 0x40
2362        ret
2363
2364160:
2365320:
2366640:
2367        AARCH64_VALID_JUMP_TARGET
2368
2369        stp             d8,  d9,  [sp, #-0x40]!
2370        stp             d10, d11, [sp, #0x10]
2371        stp             d12, d13, [sp, #0x20]
2372        stp             d14, d15, [sp, #0x30]
2373
2374        dup             v25.8h,  w7               // -dy
2375        add             x3,  x3,  #2              // Skip past left[0]
2376
2377        add             x13, x0,  x1              // alternating row
2378        lsl             x1,  x1,  #1              // stride *= 2
2379        sub             x1,  x1,  w4,  uxtw #1    // stride -= width
2380
2381        movi            v11.8h,  #8
2382        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
2383        add             v26.8h,  v26.8h,  v25.8h  // -= dy
2384        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
2385
2386        // Worst case height is 64, but we can only fit 32 pixels into
2387        // v0-v3 usable within one tbx instruction. As long as base_y is
2388        // up to 32, we use tbx.
2389        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
2390        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
2391
2392        mov             w12, w4                   // orig w
2393        neg             w14, w4                   // -w
2394
23951:
2396        mov             v23.16b, v26.16b          // reset ypos
2397
2398        asr             w9,  w8,  #6              // base_x
2399        dup             v16.8h,   w8              // xpos
2400        sub             w8,  w8,  w6              // xpos -= dx
2401        cmp             w9,  w14                  // base_x <= -2*w
2402        asr             w11, w8,  #6              // base_x
2403        b.le            169f
2404
2405        dup             v17.8h,   w8              // xpos
2406        sub             w8,  w8,  w6              // xpos -= dx
2407
2408        add             x9,  x2,  w9,  sxtw #1
2409        add             x11, x2,  w11, sxtw #1
2410
2411        sshr            v21.8h,  v16.8h,  #6      // first base_x
2412        sshr            v22.8h,  v17.8h,  #6
2413
2414        ld1             {v4.8h}, [x9], #16        // top[base_x]
2415        ld1             {v6.8h}, [x11], #16
2416
2417        movi            v10.8h,  #0x3e
2418        movi            v11.8h,  #64
2419
2420        and             v16.16b, v16.16b, v10.16b // frac_x
2421        and             v17.16b, v17.16b, v10.16b
2422
2423        sub             v8.8h,   v11.8h,  v16.8h  // 64 - frac_x
2424        sub             v9.8h,   v11.8h,  v17.8h
2425
2426        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
2427        add             v22.8h,  v22.8h,  v31.8h
2428
24292:
2430        smov            w10,     v22.h[0]
2431
2432        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2433        movi            v12.8h,  #64
2434        cmp             w10, #0                   // base_x (bottom left) >= 0
2435        smov            w10,     v29.b[0]         // base_y[0]
2436        movi            v10.8h,  #0x3e
2437
2438        b.ge            4f
2439        and             v27.16b, v23.16b, v10.16b // frac_y
2440        cmp             w10,     #(32-3)
2441
2442        mov             v18.16b, v15.16b          // left[0]
2443        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2444        b.gt            22f
2445
244621:
2447        // base_y < 32, using tbx
2448        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2449        movi            v11.8h,  #1, lsl #8
2450        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2451        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2452
2453        movi            v13.16b, #2
2454
2455        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2456
2457        add             v29.16b, v29.16b, v13.16b // base_y + 1 (*2)
2458        mov             v19.16b, v15.16b          // left[0]
2459
2460        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2461
2462        add             v29.16b, v29.16b, v13.16b // base_y + 2 (*2)
2463        mov             v20.16b, v15.16b          // left[0]
2464
2465        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2466
2467        b               23f
2468
246922:
2470        // base_y >= 32, using separate loads.
2471        smov            w15,     v29.b[1]
2472        smov            w16,     v29.b[2]
2473        add             x10, x3,  w10, sxtw #1
2474        smov            w17,     v29.b[3]
2475        add             x15, x3,  w15, sxtw #1
2476        ld3             {v18.h, v19.h, v20.h}[0], [x10]
2477        smov            w10,     v29.b[4]
2478        add             x16, x3,  w16, sxtw #1
2479        ld3             {v18.h, v19.h, v20.h}[1], [x15]
2480        smov            w15,     v29.b[5]
2481        add             x17, x3,  w17, sxtw #1
2482        ld3             {v18.h, v19.h, v20.h}[2], [x16]
2483        smov            w16,     v29.b[6]
2484        add             x10, x3,  w10, sxtw #1
2485        ld3             {v18.h, v19.h, v20.h}[3], [x17]
2486        smov            w17,     v29.b[7]
2487        add             x15, x3,  w15, sxtw #1
2488        add             x16, x3,  w16, sxtw #1
2489        ld3             {v18.h, v19.h, v20.h}[4], [x10]
2490        add             x17, x3,  w17, sxtw #1
2491        ld3             {v18.h, v19.h, v20.h}[5], [x15]
2492        ld3             {v18.h, v19.h, v20.h}[6], [x16]
2493        ld3             {v18.h, v19.h, v20.h}[7], [x17]
2494
249523:
2496
2497        ld1             {v5.8h}, [x9], #16        // top[base_x]
2498        ld1             {v7.8h}, [x11], #16
2499
2500        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2501
2502        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2503        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2504        umull2          v11.4s,  v18.8h,  v28.8h
2505        umlal2          v11.4s,  v19.8h,  v27.8h
2506        umull           v12.4s,  v19.4h,  v28.4h
2507        umlal           v12.4s,  v20.4h,  v27.4h
2508        umull2          v13.4s,  v19.8h,  v28.8h
2509        umlal2          v13.4s,  v20.8h,  v27.8h
2510
2511        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2512        ext             v19.16b, v6.16b,  v7.16b,  #2
2513
2514        rshrn           v10.4h,  v10.4s,  #6
2515        rshrn2          v10.8h,  v11.4s,  #6
2516        rshrn           v11.4h,  v12.4s,  #6
2517        rshrn2          v11.8h,  v13.4s,  #6
2518
2519        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2520        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2521        umull2          v13.4s,  v4.8h,   v8.8h
2522        umlal2          v13.4s,  v18.8h,  v16.8h
2523        umull           v14.4s,  v6.4h,   v9.4h
2524        umlal           v14.4s,  v19.4h,  v17.4h
2525        umull2          v20.4s,  v6.8h,   v9.8h
2526        umlal2          v20.4s,  v19.8h,  v17.8h
2527
2528        cmge            v18.8h,  v21.8h,  #0
2529        cmge            v19.8h,  v22.8h,  #0
2530
2531        rshrn           v12.4h,  v12.4s,  #6
2532        rshrn2          v12.8h,  v13.4s,  #6
2533        rshrn           v13.4h,  v14.4s,  #6
2534        rshrn2          v13.8h,  v20.4s,  #6
2535
2536        bit             v10.16b, v12.16b, v18.16b
2537        bit             v11.16b, v13.16b, v19.16b
2538
2539        st1             {v10.8h}, [x0], #16
2540        subs            w4,  w4,  #8
2541        st1             {v11.8h}, [x13], #16
2542        b.le            3f
2543
2544        movi            v10.8h,  #8
2545        mov             v4.16b,  v5.16b
2546        mov             v6.16b,  v7.16b
2547        add             v21.8h,  v21.8h,  v10.8h  // base_x += 8
2548        add             v22.8h,  v22.8h,  v10.8h
2549        b               2b
2550
25513:
2552        subs            w5,  w5,  #2
2553        b.le            9f
2554        movi            v10.8h, #128
2555        add             x0,  x0,  x1
2556        add             x13, x13, x1
2557        mov             w4,  w12                  // reset w
2558        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
2559        b               1b
2560
25614:      // The rest of the row only predicted from top[]
2562        ld1             {v5.8h}, [x9], #16        // top[base_x]
2563        ld1             {v7.8h}, [x11], #16
2564
2565        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
2566        ext             v19.16b, v6.16b,  v7.16b,  #2
2567
2568        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
2569        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
2570        umull2          v13.4s,  v4.8h,   v8.8h
2571        umlal2          v13.4s,  v18.8h,  v16.8h
2572        umull           v14.4s,  v6.4h,   v9.4h
2573        umlal           v14.4s,  v19.4h,  v17.4h
2574        umull2          v20.4s,  v6.8h,   v9.8h
2575        umlal2          v20.4s,  v19.8h,  v17.8h
2576
2577        rshrn           v12.4h,  v12.4s,  #6
2578        rshrn2          v12.8h,  v13.4s,  #6
2579        rshrn           v13.4h,  v14.4s,  #6
2580        rshrn2          v13.8h,  v20.4s,  #6
2581
2582        st1             {v12.8h}, [x0], #16
2583        subs            w4,  w4,  #8
2584        st1             {v13.8h}, [x13], #16
2585        b.le            3b
2586
2587        mov             v4.16b,  v5.16b
2588        mov             v6.16b,  v7.16b
2589        b               4b
2590
2591169:    // The rest of the block only predicted from left[]
2592        add             x1,  x1,  w4,  uxtw #1    // restore stride
2593        mov             w12, w5                   // orig remaining h
25941:
2595        movi            v12.8h,  #64
2596        movi            v10.8h,  #0x3e
2597
2598        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
2599        and             v27.16b, v23.16b, v10.16b // frac_y
2600
2601        smov            w10,     v29.b[0]         // base_y[0]
2602
2603        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2604        movi            v11.8h,  #1, lsl #8
2605        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2606        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
2607        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
2608
2609        cmp             w10,     #(32-1)
2610
2611        mov             v18.16b, v15.16b          // left[0]
2612        movi            v21.16b, #2
2613
2614        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
2615
2616        b.gt            31f
2617
2618        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
2619        add             v29.16b, v29.16b, v21.16b // base_y + 1 (*2)
2620
26212:
2622        // base_y < 32, using tbx.
2623        smov            w10,     v29.b[0]         // base_y[0]
2624        mov             v19.16b, v15.16b          // left[0]
2625        cmp             w10,     #(64-4)
2626        b.gt            32f
2627        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
2628        add             v29.16b, v29.16b, v21.16b // base_y + 2 (*2)
2629        mov             v20.16b, v15.16b          // left[0]
2630        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
2631        add             v29.16b, v29.16b, v21.16b // next base_y
2632
2633        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2634        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2635        umull2          v11.4s,  v18.8h,  v28.8h
2636        umlal2          v11.4s,  v19.8h,  v27.8h
2637        umull           v12.4s,  v19.4h,  v28.4h
2638        umlal           v12.4s,  v20.4h,  v27.4h
2639        umull2          v13.4s,  v19.8h,  v28.8h
2640        umlal2          v13.4s,  v20.8h,  v27.8h
2641
2642        rshrn           v10.4h,  v10.4s,  #6
2643        rshrn2          v10.8h,  v11.4s,  #6
2644        rshrn           v11.4h,  v12.4s,  #6
2645        rshrn2          v11.8h,  v13.4s,  #6
2646
2647        st1             {v10.8h}, [x0], x1
2648        subs            w5,  w5,  #2
2649        st1             {v11.8h}, [x13], x1
2650        b.le            4f
2651        mov             v18.16b, v20.16b
2652        b               2b
2653
265431:     // base_y >= 32, using separate loads, loading v18 if we had to bail
2655        // in the prologue.
2656        smov            w10,     v29.b[0]
2657        smov            w15,     v29.b[2]
2658        movi            v21.16b, #2
2659        smov            w16,     v29.b[4]
2660        add             x10, x3,  w10, sxtw
2661        smov            w17,     v29.b[6]
2662        add             x15, x3,  w15, sxtw
2663        ld1             {v18.h}[0], [x10]
2664        smov            w10,     v29.b[8]
2665        add             x16, x3,  w16, sxtw
2666        ld1             {v18.h}[1], [x15]
2667        smov            w15,     v29.b[10]
2668        add             x17, x3,  w17, sxtw
2669        ld1             {v18.h}[2], [x16]
2670        smov            w16,     v29.b[12]
2671        add             x10, x3,  w10, sxtw
2672        ld1             {v18.h}[3], [x17]
2673        smov            w17,     v29.b[14]
2674        add             x15, x3,  w15, sxtw
2675        add             x16, x3,  w16, sxtw
2676        ld1             {v18.h}[4], [x10]
2677        add             x17, x3,  w17, sxtw
2678        ld1             {v18.h}[5], [x15]
2679        add             v29.16b, v29.16b, v21.16b // next base_y
2680        ld1             {v18.h}[6], [x16]
2681        ld1             {v18.h}[7], [x17]
2682
268332:     // base_y >= 32, using separate loads.
2684        cmp             w5,  #4
2685        b.lt            34f
268633:     // h >= 4, preserving v18 from the previous round, loading v19-v22.
2687        smov            w10,     v29.b[0]
2688        subs            w5,  w5,  #4
2689        smov            w15,     v29.b[2]
2690        movi            v10.16b, #8
2691        smov            w16,     v29.b[4]
2692        add             x10, x3,  w10, sxtw
2693        smov            w17,     v29.b[6]
2694        add             x15, x3,  w15, sxtw
2695        ld4             {v19.h, v20.h, v21.h, v22.h}[0], [x10]
2696        smov            w10,     v29.b[8]
2697        add             x16, x3,  w16, sxtw
2698        ld4             {v19.h, v20.h, v21.h, v22.h}[1], [x15]
2699        smov            w15,     v29.b[10]
2700        add             x17, x3,  w17, sxtw
2701        ld4             {v19.h, v20.h, v21.h, v22.h}[2], [x16]
2702        smov            w16,     v29.b[12]
2703        add             x10, x3,  w10, sxtw
2704        ld4             {v19.h, v20.h, v21.h, v22.h}[3], [x17]
2705        smov            w17,     v29.b[14]
2706        add             x15, x3,  w15, sxtw
2707        add             x16, x3,  w16, sxtw
2708        ld4             {v19.h, v20.h, v21.h, v22.h}[4], [x10]
2709        add             x17, x3,  w17, sxtw
2710        ld4             {v19.h, v20.h, v21.h, v22.h}[5], [x15]
2711        ld4             {v19.h, v20.h, v21.h, v22.h}[6], [x16]
2712        add             v29.16b, v29.16b, v10.16b // next base_y
2713        ld4             {v19.h, v20.h, v21.h, v22.h}[7], [x17]
2714
2715        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2716        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2717        umull2          v11.4s,  v18.8h,  v28.8h
2718        umlal2          v11.4s,  v19.8h,  v27.8h
2719        umull           v12.4s,  v19.4h,  v28.4h
2720        umlal           v12.4s,  v20.4h,  v27.4h
2721        umull2          v13.4s,  v19.8h,  v28.8h
2722        umlal2          v13.4s,  v20.8h,  v27.8h
2723
2724        rshrn           v10.4h,  v10.4s,  #6
2725        rshrn2          v10.8h,  v11.4s,  #6
2726        rshrn           v11.4h,  v12.4s,  #6
2727        rshrn2          v11.8h,  v13.4s,  #6
2728
2729        umull           v12.4s,  v20.4h,  v28.4h  // left[base_y]*(64-frac_y)
2730        umlal           v12.4s,  v21.4h,  v27.4h  // + left[base_y+1]*frac_y
2731        umull2          v13.4s,  v20.8h,  v28.8h
2732        umlal2          v13.4s,  v21.8h,  v27.8h
2733        umull           v14.4s,  v21.4h,  v28.4h
2734        umlal           v14.4s,  v22.4h,  v27.4h
2735        umull2          v18.4s,  v21.8h,  v28.8h
2736        umlal2          v18.4s,  v22.8h,  v27.8h
2737
2738        rshrn           v12.4h,  v12.4s,  #6
2739        rshrn2          v12.8h,  v13.4s,  #6
2740        rshrn           v13.4h,  v14.4s,  #6
2741        rshrn2          v13.8h,  v18.4s,  #6
2742
2743        st1             {v10.8h}, [x0],  x1
2744        cmp             w5,  #2
2745        st1             {v11.8h}, [x13], x1
2746        st1             {v12.8h}, [x0],  x1
2747        st1             {v13.8h}, [x13], x1
2748        b.lt            4f
2749        mov             v18.16b, v22.16b
2750        b.gt            33b
2751
275234:     // h == 2, preserving v18 from the previous round, loading v19-v20.
2753        smov            w10,     v29.b[0]
2754        smov            w15,     v29.b[2]
2755        movi            v21.16b, #4
2756        smov            w16,     v29.b[4]
2757        add             x10, x3,  w10, sxtw
2758        smov            w17,     v29.b[6]
2759        add             x15, x3,  w15, sxtw
2760        ld2             {v19.h, v20.h}[0], [x10]
2761        smov            w10,     v29.b[8]
2762        add             x16, x3,  w16, sxtw
2763        ld2             {v19.h, v20.h}[1], [x15]
2764        smov            w15,     v29.b[10]
2765        add             x17, x3,  w17, sxtw
2766        ld2             {v19.h, v20.h}[2], [x16]
2767        smov            w16,     v29.b[12]
2768        add             x10, x3,  w10, sxtw
2769        ld2             {v19.h, v20.h}[3], [x17]
2770        smov            w17,     v29.b[14]
2771        add             x15, x3,  w15, sxtw
2772        add             x16, x3,  w16, sxtw
2773        ld2             {v19.h, v20.h}[4], [x10]
2774        add             x17, x3,  w17, sxtw
2775        ld2             {v19.h, v20.h}[5], [x15]
2776        ld2             {v19.h, v20.h}[6], [x16]
2777        add             v29.16b, v29.16b, v21.16b // next base_y
2778        ld2             {v19.h, v20.h}[7], [x17]
2779
2780        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2781        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2782        umull2          v11.4s,  v18.8h,  v28.8h
2783        umlal2          v11.4s,  v19.8h,  v27.8h
2784        umull           v12.4s,  v19.4h,  v28.4h
2785        umlal           v12.4s,  v20.4h,  v27.4h
2786        umull2          v13.4s,  v19.8h,  v28.8h
2787        umlal2          v13.4s,  v20.8h,  v27.8h
2788
2789        rshrn           v10.4h,  v10.4s,  #6
2790        rshrn2          v10.8h,  v11.4s,  #6
2791        rshrn           v11.4h,  v12.4s,  #6
2792        rshrn2          v11.8h,  v13.4s,  #6
2793
2794        st1             {v10.8h}, [x0], x1
2795        st1             {v11.8h}, [x13], x1
2796        // The h==2 case only happens once at the end, if at all.
2797
27984:
2799        subs            w4,  w4,  #8
2800        b.le            9f
2801
2802        lsr             x1,  x1,  #1
2803        msub            x0,  x1,  x12, x0         // ptr -= h * stride
2804        msub            x13, x1,  x12, x13
2805        lsl             x1,  x1,  #1
2806        add             x0,  x0,  #16
2807        add             x13, x13, #16
2808        mov             w5,  w12                  // reset h
2809        b               1b
2810
28119:
2812        ldp             d14, d15, [sp, #0x30]
2813        ldp             d12, d13, [sp, #0x20]
2814        ldp             d10, d11, [sp, #0x10]
2815        ldp             d8,  d9,  [sp], 0x40
2816        ret
2817
2818L(ipred_z2_fill1_tbl):
2819        .hword L(ipred_z2_fill1_tbl) - 640b
2820        .hword L(ipred_z2_fill1_tbl) - 320b
2821        .hword L(ipred_z2_fill1_tbl) - 160b
2822        .hword L(ipred_z2_fill1_tbl) -  80b
2823        .hword L(ipred_z2_fill1_tbl) -  40b
2824endfunc
2825
2826function ipred_z2_fill2_16bpc_neon, export=1
2827        cmp             w4,  #8
2828        mov             w8,  #(2 << 6)            // xpos = 2 << 6
2829        sub             w8,  w8,  w6              // xpos -= dx
2830
2831        movrel          x11, increments
2832        ld1             {v31.8h},  [x11]          // increments
2833        neg             w7,  w7                   // -dy
2834        b.eq            80f
2835
283640:
2837        dup             v30.4h,  w7               // -dy
2838        movi            v17.8b,  #1
2839
2840        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
2841        movi            v25.8h,  #0x3e
2842        add             v30.4h,  v16.4h,  v30.4h  // -= dy
2843
2844        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2845        // from left.
2846        ld1             {v0.8h, v1.8h}, [x3]      // left[]
2847
2848        movi            v26.8h,  #64
2849        movi            v19.16b, #4
2850
2851        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
2852        and             v27.8b,  v30.8b,  v25.8b  // frac_y
2853
2854        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2855
2856        movi            v23.4h,  #1, lsl #8
2857        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2858        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
2859        movi            v17.8b,  #2
2860        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
2861
2862        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
2863        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
2864
2865        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
2866
2867        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
2868
2869        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
2870
2871        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
2872
2873        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
2874        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
2875
2876        movi            v29.16b, #4
2877        add             v31.8h,  v31.8h,  v31.8h  // {0,2,4,6,0,2,4,6}
28784:
2879        asr             w9,  w8,  #6              // base_x
2880        dup             v16.4h,  w8               // xpos
2881        sub             w8,  w8,  w6              // xpos -= dx
2882        cmp             w9,  #-8                  // base_x <= -8
2883        asr             w11, w8,  #6              // base_x
2884        b.le            49f
2885
2886        lsl             w9,  w9,  #1
2887        lsl             w11, w11, #1
2888
2889        dup             v17.4h,  w8               // xpos
2890
2891        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
2892        ldr             q6,  [x2, w11, sxtw]
2893
2894        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
2895
2896        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2897
2898        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
2899
2900        uzp2            v5.8h,   v4.8h,   v6.8h   // top[base_x+1]
2901        uzp1            v4.8h,   v4.8h,   v6.8h   // top[base_x]
2902
2903        and             v16.16b, v16.16b, v25.16b // frac_x
2904
2905        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2906
2907        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
2908
2909        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
2910
2911        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2912        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2913        umull2          v22.4s,  v18.8h,  v28.8h
2914        umlal2          v22.4s,  v19.8h,  v27.8h
2915
2916        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
2917        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
2918        umull2          v24.4s,  v4.8h,   v17.8h
2919        umlal2          v24.4s,  v5.8h,   v16.8h
2920
2921        cmge            v20.8h,  v20.8h,  #0
2922
2923        rshrn           v21.4h,  v21.4s,  #6
2924        rshrn2          v21.8h,  v22.4s,  #6
2925        rshrn           v22.4h,  v23.4s,  #6
2926        rshrn2          v22.8h,  v24.4s,  #6
2927
2928        bit             v21.16b, v22.16b, v20.16b
2929
2930        st1             {v21.d}[0], [x0], x1
2931        sub             w8,  w8,  w6              // xpos -= dx
2932        subs            w5,  w5,  #2
2933        st1             {v21.d}[1], [x0], x1
2934        b.le            9f
2935
2936        ext             v18.16b, v19.16b, v19.16b, #8
2937        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2938        b               4b
2939
294049:
2941        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
2942
2943        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
2944
2945        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
2946        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
2947        umull2          v21.4s,  v18.8h,  v28.8h
2948        umlal2          v21.4s,  v19.8h,  v27.8h
2949
2950        rshrn           v20.4h,  v20.4s,  #6
2951        rshrn2          v20.8h,  v21.4s,  #6
2952
2953        st1             {v20.d}[0], [x0], x1
2954        subs            w5,  w5,  #2
2955        st1             {v20.d}[1], [x0], x1
2956        b.le            9f
2957
2958        ext             v18.16b, v19.16b, v19.16b, #8
2959        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
2960        b               49b
2961
29629:
2963        ret
2964
296580:
2966        stp             d8,  d9,  [sp, #-0x40]!
2967        stp             d10, d11, [sp, #0x10]
2968        stp             d12, d13, [sp, #0x20]
2969        stp             d14, d15, [sp, #0x30]
2970
2971        dup             v18.8h,  w7               // -dy
2972        movi            v17.8b,  #1
2973
2974        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
2975        movi            v25.8h,  #0x3e
2976        add             v16.8h,  v16.8h,  v18.8h  // -= dy
2977
2978        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
2979        // from left.
2980        ld1             {v0.8h, v1.8h}, [x3]      // left[]
2981
2982        movi            v26.8h,  #64
2983        movi            v19.16b, #4
2984
2985        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
2986        and             v27.16b, v16.16b, v25.16b // frac_y
2987
2988        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
2989
2990        movi            v23.8h,  #1, lsl #8
2991        shl             v29.8b,  v29.8b,  #1      // 2*base_y
2992        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
2993        movi            v17.16b, #2
2994        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
2995
2996        // Cut corners here; for the first row we don't expect to need to
2997        // read outside of v0.
2998        tbl             v18.16b, {v0.16b}, v29.16b // left[base_y]
2999
3000        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
3001        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
3002
3003        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
3004
3005        movi            v24.16b, #4
3006        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
30078:
3008        asr             w9,  w8,  #6              // base_x
3009        dup             v16.8h,   w8              // xpos
3010        sub             w8,  w8,  w6              // xpos -= dx
3011        cmp             w9,  #-16                 // base_x <= -16
3012        asr             w11, w8,  #6              // base_x
3013        b.le            89f
3014
3015        dup             v17.8h,   w8              // xpos
3016
3017        add             x9,  x2,  w9,  sxtw #1
3018        add             x11, x2,  w11, sxtw #1
3019
3020        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
3021        ld1             {v6.8h, v7.8h}, [x11]
3022
3023        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3024
3025        sshr            v21.8h,  v16.8h,  #6      // first base_x
3026        sshr            v22.8h,  v17.8h,  #6
3027
3028        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3029
3030        uzp2            v2.8h,   v4.8h,   v5.8h   // top[base_x+1]
3031        uzp1            v4.8h,   v4.8h,   v5.8h   // top[base_x]
3032        uzp2            v3.8h,   v6.8h,   v7.8h
3033        uzp1            v6.8h,   v6.8h,   v7.8h
3034        mov             v5.16b,  v2.16b
3035        mov             v7.16b,  v3.16b
3036
3037        and             v16.16b, v16.16b, v25.16b // frac_x
3038        and             v17.16b, v17.16b, v25.16b
3039
3040        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3041        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3042
3043        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
3044        sub             v9.8h,   v26.8h,  v17.8h
3045
3046        umull2          v11.4s,  v18.8h,  v28.8h
3047        umlal2          v11.4s,  v19.8h,  v27.8h
3048
3049        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
3050        add             v22.8h,  v22.8h,  v31.8h
3051
3052        umull           v12.4s,  v19.4h,  v28.4h
3053        umlal           v12.4s,  v20.4h,  v27.4h
3054        umull2          v13.4s,  v19.8h,  v28.8h
3055        umlal2          v13.4s,  v20.8h,  v27.8h
3056
3057        rshrn           v10.4h,  v10.4s,  #6
3058        rshrn2          v10.8h,  v11.4s,  #6
3059        rshrn           v11.4h,  v12.4s,  #6
3060        rshrn2          v11.8h,  v13.4s,  #6
3061
3062        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
3063        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
3064        umull2          v13.4s,  v4.8h,   v8.8h
3065        umlal2          v13.4s,  v5.8h,   v16.8h
3066        umull           v14.4s,  v6.4h,   v9.4h
3067        umlal           v14.4s,  v7.4h,   v17.4h
3068        umull2          v18.4s,  v6.8h,   v9.8h
3069        umlal2          v18.4s,  v7.8h,   v17.8h
3070
3071        cmge            v21.8h,  v21.8h,  #0
3072        cmge            v22.8h,  v22.8h,  #0
3073
3074        rshrn           v12.4h,  v12.4s,  #6
3075        rshrn2          v12.8h,  v13.4s,  #6
3076        rshrn           v13.4h,  v14.4s,  #6
3077        rshrn2          v13.8h,  v18.4s,  #6
3078
3079        bit             v10.16b, v12.16b, v21.16b
3080        bit             v11.16b, v13.16b, v22.16b
3081
3082        st1             {v10.8h}, [x0], x1
3083        subs            w5,  w5,  #2
3084        sub             w8,  w8,  w6              // xpos -= dx
3085        st1             {v11.8h}, [x0], x1
3086        b.le            9f
3087
3088        mov             v18.16b, v20.16b
3089        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3090        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3091        b               8b
3092
309389:
3094        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
3095        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]
3096
3097        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
3098        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
3099        umull2          v5.4s,   v18.8h,  v28.8h
3100        umlal2          v5.4s,   v19.8h,  v27.8h
3101        umull           v6.4s,   v19.4h,  v28.4h
3102        umlal           v6.4s,   v20.4h,  v27.4h
3103        umull2          v7.4s,   v19.8h,  v28.8h
3104        umlal2          v7.4s,   v20.8h,  v27.8h
3105
3106        rshrn           v4.4h,   v4.4s,   #6
3107        rshrn2          v4.8h,   v5.4s,   #6
3108        rshrn           v5.4h,   v6.4s,   #6
3109        rshrn2          v5.8h,   v7.4s,   #6
3110
3111        st1             {v4.8h}, [x0], x1
3112        subs            w5,  w5,  #2
3113        st1             {v5.8h}, [x0], x1
3114        b.le            9f
3115
3116        mov             v18.16b, v20.16b
3117        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
3118        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
3119        b               89b
3120
31219:
3122        ldp             d14, d15, [sp, #0x30]
3123        ldp             d12, d13, [sp, #0x20]
3124        ldp             d10, d11, [sp, #0x10]
3125        ldp             d8,  d9,  [sp], 0x40
3126        ret
3127endfunc
3128
// Z2 (diagonal, up-left) prediction, variant with upsample_left, 16 bpc.
// Presumed C prototype (matches the sibling z2 fill functions — confirm):
//   void ipred_z2_fill3_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                  const pixel *const top,
//                                  const pixel *const left,
//                                  const int width, const int height,
//                                  const int dx, const int dy);
// In:  x0 = dst, x1 = stride, x2 = top[], x3 = left[],
//      w4 = width (4 or 8), w5 = height, w6 = dx, w7 = dy
// Because left[] is upsampled, each output row advances base_y by 2
// elements (all left[] byte indices below are therefore "(*2)").
// Columns right of the diagonal take the top[] interpolation, columns
// left of it take the left[] interpolation, selected per lane via cmge+bit.
function ipred_z2_fill3_16bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(1 << 6)            // xpos = 1 << 6
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments
        neg             w7,  w7                   // -dy
        b.eq            80f

        // Width == 4 path. Processes 2 rows per iteration; each 128-bit
        // vector holds two rows of 4 pixels (row 0 in the low half).
40:
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1               // dead value: v17 is rewritten with #2 below

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.8h,  #0x3e            // 0x3e = mask for the 6 fractional bits
        add             v30.4h,  v16.4h,  v30.4h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #2

        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v30.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        movi            v23.4h,  #1, lsl #8       // {0x100,..}: +1 on every odd byte index
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        movi            v19.16b, #4
        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
        movi            v17.8b,  #2
        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ... (tbl byte indices)

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}

        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3 (*2)

        trn1            v29.2d,  v29.2d,  v28.2d  // base_y + 0, base_y + 2 (rows 0 and 1)
        trn1            v30.2d,  v30.2d,  v24.2d  // base_y + 1, base_y + 3

        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y

        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y

        movi            v24.16b, #8
4:
        asr             w9,  w8,  #6              // base_x
        dup             v16.4h,  w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-4                  // base_x <= -4: whole 2-row pair from left[]
        asr             w11, w8,  #6              // base_x
        b.le            49f

        lsl             w9,  w9,  #1
        lsl             w11, w11, #1

        dup             v17.4h,  w8               // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
        ldr             q6,  [x2, w11, sxtw]

        trn1            v16.2d,  v16.2d,  v17.2d  // xpos

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row

        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v6.16b,  #2

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]

        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x

        add             v20.8h,  v20.8h,  v31.8h  // actual base_x

        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v22.4s,  v18.8h,  v28.8h
        umlal2          v22.4s,  v19.8h,  v27.8h

        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]*(64-frac_x)
        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v24.4s,  v4.8h,   v17.8h
        umlal2          v24.4s,  v5.8h,   v16.8h

        cmge            v20.8h,  v20.8h,  #0      // base_x >= 0: lane uses top[], else left[]

        rshrn           v21.4h,  v21.4s,  #6
        rshrn2          v21.8h,  v22.4s,  #6
        rshrn           v22.4h,  v23.4s,  #6
        rshrn2          v22.8h,  v24.4s,  #6

        movi            v24.16b, #8               // reload increment (v24 clobbered above)

        bit             v21.16b, v22.16b, v20.16b // select top result where base_x >= 0

        st1             {v21.d}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               4b

        // Tail: base_x is fully negative, output comes from left[] only.
49:
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v21.4s,  v18.8h,  v28.8h
        umlal2          v21.4s,  v19.8h,  v27.8h

        rshrn           v20.4h,  v20.4s,  #6
        rshrn2          v20.8h,  v21.4s,  #6

        st1             {v20.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
        b               49b

9:
        ret

        // Width == 8 path. Needs v8-v15 as scratch, so save the
        // callee-saved low halves (AAPCS64: d8-d15) first.
80:
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        dup             v18.8h,  w7               // -dy
        movi            v17.16b, #2

        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.8h,  #0x3e            // mask for the 6 fractional bits
        add             v16.8h,  v16.8h,  v18.8h  // -= dy

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #4               // dead value: v19 is rewritten by tbl before use

        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
        and             v27.16b, v16.16b, v25.16b // frac_y

        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 2

        movi            v23.8h,  #1, lsl #8       // +1 on every odd byte index
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ... (tbl byte indices)

        add             v30.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y

        movi            v24.16b, #4
8:
        asr             w9,  w8,  #6              // base_x
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-16                 // base_x <= -16: whole 2-row pair from left[]
        asr             w11, w8,  #6              // base_x
        b.le            89f

        dup             v17.8h,   w8              // xpos

        add             x9,  x2,  w9,  sxtw #1
        add             x11, x2,  w11, sxtw #1

        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
        ld1             {v6.8h, v7.8h}, [x11]

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b

        sshr            v22.8h,  v16.8h,  #6      // first base_x
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        sshr            v23.8h,  v17.8h,  #6
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v7.16b,  #2

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y

        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
        sub             v9.8h,   v26.8h,  v17.8h

        umull2          v11.4s,  v18.8h,  v28.8h
        umlal2          v11.4s,  v19.8h,  v27.8h

        add             v22.8h,  v22.8h,  v31.8h  // actual base_x
        add             v23.8h,  v23.8h,  v31.8h

        umull           v12.4s,  v20.4h,  v28.4h  // second row: left[base_y+2]/[base_y+3]
        umlal           v12.4s,  v21.4h,  v27.4h
        umull2          v13.4s,  v20.8h,  v28.8h
        umlal2          v13.4s,  v21.8h,  v27.8h

        rshrn           v10.4h,  v10.4s,  #6
        rshrn2          v10.8h,  v11.4s,  #6
        rshrn           v11.4h,  v12.4s,  #6
        rshrn2          v11.8h,  v13.4s,  #6

        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v13.4s,  v4.8h,   v8.8h
        umlal2          v13.4s,  v5.8h,   v16.8h
        umull           v14.4s,  v6.4h,   v9.4h
        umlal           v14.4s,  v7.4h,   v17.4h
        umull2          v18.4s,  v6.8h,   v9.8h
        umlal2          v18.4s,  v7.8h,   v17.8h

        cmge            v22.8h,  v22.8h,  #0      // base_x >= 0: lane uses top[], else left[]
        cmge            v23.8h,  v23.8h,  #0

        rshrn           v12.4h,  v12.4s,  #6
        rshrn2          v12.8h,  v13.4s,  #6
        rshrn           v13.4h,  v14.4s,  #6
        rshrn2          v13.8h,  v18.4s,  #6

        bit             v10.16b, v12.16b, v22.16b // select top result where base_x >= 0
        bit             v11.16b, v13.16b, v23.16b

        st1             {v10.8h}, [x0], x1
        subs            w5,  w5,  #2
        sub             w8,  w8,  w6              // xpos -= dx
        st1             {v11.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               8b

        // Tail: base_x is fully negative, output comes from left[] only.
89:
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
        add             v30.16b, v30.16b, v24.16b
        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]

        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v5.4s,   v18.8h,  v28.8h
        umlal2          v5.4s,   v19.8h,  v27.8h
        umull           v6.4s,   v20.4h,  v28.4h
        umlal           v6.4s,   v21.4h,  v27.4h
        umull2          v7.4s,   v20.8h,  v28.8h
        umlal2          v7.4s,   v21.8h,  v27.8h

        rshrn           v4.4h,   v4.4s,   #6
        rshrn2          v4.8h,   v5.4s,   #6
        rshrn           v5.4h,   v6.4s,   #6
        rshrn2          v5.8h,   v7.4s,   #6

        st1             {v4.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b
        b               89b

9:
        // Restore callee-saved d8-d15 (width == 8 path only).
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc
3428
3429// void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3430//                                const pixel *const left,
3431//                                const int width, const int height,
3432//                                const int dy, const int max_base_y);
3433function ipred_z3_fill1_16bpc_neon, export=1
3434        clz             w9,  w4
3435        adr             x8,  L(ipred_z3_fill1_tbl)
3436        sub             w9,  w9,  #25
3437        ldrh            w9,  [x8, w9, uxtw #1]
3438        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
3439        sub             x8,  x8,  w9,  uxtw
3440        ld1r            {v31.8h}, [x10]           // padding
3441        mov             w7,  w5
3442        mov             w15, #64
3443        add             x13, x0,  x1
3444        lsl             x1,  x1,  #1
3445        br              x8
3446
344740:
3448        AARCH64_VALID_JUMP_TARGET
34494:
3450        lsr             w8,  w7,  #6              // base
3451        and             w9,  w7,  #0x3e           // frac
3452        add             w7,  w7,  w5              // xpos += dx
3453        cmp             w8,  w6                   // base >= max_base_x
3454        lsr             w10, w7,  #6              // base
3455        and             w11, w7,  #0x3e           // frac
3456        b.ge            ipred_z3_fill_padding_neon
3457        lsl             w8,  w8,  #1
3458        lsl             w10, w10, #1
3459        ldr             q0,  [x2, w8, uxtw]       // left[base]
3460        ldr             q2,  [x2, w10, uxtw]
3461        dup             v4.8h,   w9               // frac
3462        dup             v5.8h,   w11
3463        ext             v1.16b,  v0.16b,  v0.16b,  #2 // left[base+1]
3464        ext             v3.16b,  v2.16b,  v2.16b,  #2
3465        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
3466        sub             v7.4h,   v3.4h,   v2.4h
3467        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
3468        ushll           v17.4s,  v2.4h,   #6
3469        smlal           v16.4s,  v6.4h,   v4.4h   // + top[base+1]*frac
3470        smlal           v17.4s,  v7.4h,   v5.4h
3471        rshrn           v16.4h,  v16.4s,  #6
3472        rshrn           v17.4h,  v17.4s,  #6
3473        subs            w3,  w3,  #2
3474        zip1            v18.8h,  v16.8h,  v17.8h
3475        st1             {v18.s}[0], [x0],  x1
3476        st1             {v18.s}[1], [x13], x1
3477        add             w7,  w7,  w5              // xpos += dx
3478        st1             {v18.s}[2], [x0]
3479        st1             {v18.s}[3], [x13]
3480        b.le            9f
3481        sub             x0,  x0,  x1              // ptr -= 4 * (2*stride)
3482        sub             x13, x13, x1
3483        add             x0,  x0,  #4
3484        add             x13, x13, #4
3485        b               4b
34869:
3487        ret
3488
348980:
3490        AARCH64_VALID_JUMP_TARGET
34918:
3492        lsr             w8,  w7,  #6              // base
3493        and             w9,  w7,  #0x3e           // frac
3494        add             w7,  w7,  w5              // xpos += dx
3495        cmp             w8,  w6                   // base >= max_base_x
3496        lsr             w10, w7,  #6              // base
3497        and             w11, w7,  #0x3e           // frac
3498        b.ge            ipred_z3_fill_padding_neon
3499        add             x8,  x2,  w8,  uxtw #1
3500        add             x10, x2,  w10, uxtw #1
3501        dup             v4.8h,   w9               // frac
3502        dup             v5.8h,   w11
3503        ld1             {v0.8h},  [x8]            // left[base]
3504        ld1             {v2.8h},  [x10]
3505        sub             w9,  w15, w9              // 64 - frac
3506        sub             w11, w15, w11
3507        ldr             h1, [x8, #16]
3508        ldr             h3, [x10, #16]
3509        dup             v6.8h,   w9               // 64 - frac
3510        dup             v7.8h,   w11
3511        ext             v1.16b,  v0.16b,  v1.16b,  #2 // left[base+1]
3512        ext             v3.16b,  v2.16b,  v3.16b,  #2
3513        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
3514        umlal           v16.4s,  v1.4h,   v4.4h   // + left[base+1]*frac
3515        umull2          v17.4s,  v0.8h,   v6.8h
3516        umlal2          v17.4s,  v1.8h,   v4.8h
3517        umull           v18.4s,  v2.4h,   v7.4h
3518        umlal           v18.4s,  v3.4h,   v5.4h
3519        umull2          v19.4s,  v2.8h,   v7.8h
3520        umlal2          v19.4s,  v3.8h,   v5.8h
3521        rshrn           v16.4h,  v16.4s,  #6
3522        rshrn2          v16.8h,  v17.4s,  #6
3523        rshrn           v17.4h,  v18.4s,  #6
3524        rshrn2          v17.8h,  v19.4s,  #6
3525        subs            w3,  w3,  #2
3526        zip1            v18.8h,  v16.8h,  v17.8h
3527        zip2            v19.8h,  v16.8h,  v17.8h
3528        add             w7,  w7,  w5              // xpos += dx
3529        st1             {v18.s}[0], [x0],  x1
3530        st1             {v18.s}[1], [x13], x1
3531        st1             {v18.s}[2], [x0],  x1
3532        st1             {v18.s}[3], [x13], x1
3533        st1             {v19.s}[0], [x0],  x1
3534        st1             {v19.s}[1], [x13], x1
3535        st1             {v19.s}[2], [x0],  x1
3536        st1             {v19.s}[3], [x13], x1
3537        b.le            9f
3538        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
3539        sub             x13, x13, x1, lsl #2
3540        add             x0,  x0,  #4
3541        add             x13, x13, #4
3542        b               8b
35439:
3544        ret
3545
3546160:
3547320:
3548640:
3549        AARCH64_VALID_JUMP_TARGET
3550        mov             w12, w4
35511:
3552        lsr             w8,  w7,  #6              // base
3553        and             w9,  w7,  #0x3e           // frac
3554        add             w7,  w7,  w5              // ypos += dy
3555        cmp             w8,  w6                   // base >= max_base_y
3556        lsr             w10, w7,  #6              // base
3557        and             w11, w7,  #0x3e           // frac
3558        b.ge            ipred_z3_fill_padding_neon
3559        add             x8,  x2,  w8,  uxtw #1
3560        add             x10, x2,  w10, uxtw #1
3561        dup             v6.8h,   w9               // frac
3562        dup             v7.8h,   w11
3563        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // left[base]
3564        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
3565        sub             w9,  w15, w9              // 64 - frac
3566        sub             w11, w15, w11
3567        dup             v16.8h,  w9               // 64 - frac
3568        dup             v17.8h,  w11
3569        add             w7,  w7,  w5              // ypos += dy
35702:
3571        ext             v18.16b, v0.16b,  v1.16b,  #2 // left[base+1]
3572        ext             v19.16b, v1.16b,  v2.16b,  #2
3573        ext             v20.16b, v3.16b,  v4.16b,  #2
3574        ext             v21.16b, v4.16b,  v5.16b,  #2
3575        subs            w4,  w4,  #16
3576        umull           v22.4s,  v0.4h,   v16.4h  // left[base]*(64-frac)
3577        umlal           v22.4s,  v18.4h,  v6.4h   // + left[base+1]*frac
3578        umull2          v23.4s,  v0.8h,   v16.8h
3579        umlal2          v23.4s,  v18.8h,  v6.8h
3580        umull           v24.4s,  v1.4h,   v16.4h
3581        umlal           v24.4s,  v19.4h,  v6.4h
3582        umull2          v25.4s,  v1.8h,   v16.8h
3583        umlal2          v25.4s,  v19.8h,  v6.8h
3584        umull           v26.4s,  v3.4h,   v17.4h
3585        umlal           v26.4s,  v20.4h,  v7.4h
3586        umull2          v27.4s,  v3.8h,   v17.8h
3587        umlal2          v27.4s,  v20.8h,  v7.8h
3588        umull           v28.4s,  v4.4h,   v17.4h
3589        umlal           v28.4s,  v21.4h,  v7.4h
3590        umull2          v29.4s,  v4.8h,   v17.8h
3591        umlal2          v29.4s,  v21.8h,  v7.8h
3592        rshrn           v22.4h,  v22.4s,  #6
3593        rshrn2          v22.8h,  v23.4s,  #6
3594        rshrn           v23.4h,  v24.4s,  #6
3595        rshrn2          v23.8h,  v25.4s,  #6
3596        rshrn           v24.4h,  v26.4s,  #6
3597        rshrn2          v24.8h,  v27.4s,  #6
3598        rshrn           v25.4h,  v28.4s,  #6
3599        rshrn2          v25.8h,  v29.4s,  #6
3600        zip1            v18.8h,  v22.8h,  v24.8h
3601        zip2            v19.8h,  v22.8h,  v24.8h
3602        zip1            v20.8h,  v23.8h,  v25.8h
3603        zip2            v21.8h,  v23.8h,  v25.8h
3604        st1             {v18.s}[0], [x0],  x1
3605        st1             {v18.s}[1], [x13], x1
3606        st1             {v18.s}[2], [x0],  x1
3607        st1             {v18.s}[3], [x13], x1
3608        st1             {v19.s}[0], [x0],  x1
3609        st1             {v19.s}[1], [x13], x1
3610        st1             {v19.s}[2], [x0],  x1
3611        st1             {v19.s}[3], [x13], x1
3612        st1             {v20.s}[0], [x0],  x1
3613        st1             {v20.s}[1], [x13], x1
3614        st1             {v20.s}[2], [x0],  x1
3615        st1             {v20.s}[3], [x13], x1
3616        st1             {v21.s}[0], [x0],  x1
3617        st1             {v21.s}[1], [x13], x1
3618        st1             {v21.s}[2], [x0],  x1
3619        st1             {v21.s}[3], [x13], x1
3620        b.le            3f
3621        mov             v0.16b,  v2.16b
3622        ld1             {v1.8h, v2.8h}, [x8],  #32      // left[base]
3623        mov             v3.16b,  v5.16b
3624        ld1             {v4.8h, v5.8h}, [x10], #32
3625        b               2b
3626
36273:
3628        subs            w3,  w3,  #2
3629        b.le            9f
3630        lsr             x1,  x1,  #1
3631        msub            x0,  x1,  x12, x0         // ptr -= h * stride
3632        msub            x13, x1,  x12, x13
3633        lsl             x1,  x1,  #1
3634        add             x0,  x0,  #4
3635        add             x13, x13, #4
3636        mov             w4,  w12
3637        b               1b
36389:
3639        ret
3640
3641L(ipred_z3_fill1_tbl):
3642        .hword L(ipred_z3_fill1_tbl) - 640b
3643        .hword L(ipred_z3_fill1_tbl) - 320b
3644        .hword L(ipred_z3_fill1_tbl) - 160b
3645        .hword L(ipred_z3_fill1_tbl) -  80b
3646        .hword L(ipred_z3_fill1_tbl) -  40b
3647endfunc
3648
// Fill the out-of-range region of a z3 prediction block with the
// padding value broadcast in v31 (left[max_base_y]).
// In: x0/x13 = dst ptrs for even/odd row pairs, x1 = 2*stride,
//     w3 = width, w4 = height (w12 = saved height for column restarts).
// Widths <= 8 go through the power-of-two jump table below; wider
// blocks take the row-at-a-time L(ipred_z3_fill_padding_wide) path.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #8
        adr             x8,  L(ipred_z3_fill_padding_tbl)
        b.gt            L(ipred_z3_fill_padding_wide)
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3
        sub             w9,  w9,  #25
        ldrh            w9,  [x8, w9, uxtw #1]
        sub             x9,  x8,  w9,  uxtw
        br              x9

2:      // 2 pixels wide
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4
        add             x13, x13, #4
        mov             w4,  w12
        b               1b

4:      // 4 pixels wide
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.4h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.4h}, [x13], x1
        st1             {v31.4h}, [x0],  x1
        st1             {v31.4h}, [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

8:      // 8 (or more, in 8-pixel steps) pixels wide
16:
32:
64:
        AARCH64_VALID_JUMP_TARGET
        st1             {v31.8h}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8h}, [x13], x1
        st1             {v31.8h}, [x0],  x1
        st1             {v31.8h}, [x13], x1
        // Loop back to this 8-pixel store loop. (Branching to 4b here,
        // as before, re-entered the 4-pixel loop above and left the
        // upper half of every remaining row unfilled.)
        b.gt            8b
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret

L(ipred_z3_fill_padding_tbl):
        .hword L(ipred_z3_fill_padding_tbl) - 64b
        .hword L(ipred_z3_fill_padding_tbl) - 32b
        .hword L(ipred_z3_fill_padding_tbl) - 16b
        .hword L(ipred_z3_fill_padding_tbl) -  8b
        .hword L(ipred_z3_fill_padding_tbl) -  4b
        .hword L(ipred_z3_fill_padding_tbl) -  2b

L(ipred_z3_fill_padding_wide):
        // Fill a WxH rectangle with padding, with W > 8.
        lsr             x1,  x1,  #1
        mov             w12, w3
        sub             x1,  x1,  w3,  uxtw #1    // stride minus one row's width
1:
        ands            w5,  w3,  #7
        b.eq            2f
        // If the width isn't aligned to 8, first do one 8 pixel write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.8h}, [x0]
        add             x0,  x0,  w5,  uxtw #1
2:
        // Fill the rest of the line with aligned 8 pixel writes.
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1
        b.le            9f
        mov             w3,  w12
        b               1b
9:
        ret
endfunc
3763
// z3 edge fill for upsampled edges (h == 4 or h == 8): the left edge
// array at x2 holds interleaved even/odd upsampled samples, so
// left[base] / left[base+1] are deinterleaved with uzp1/uzp2.
// Two output columns are produced per iteration and interleaved with
// zip1/zip2 before being stored transposed, two pixels per row.
// In: x0 = dst, x1 = stride, x2 = left edge, w3 = width, w4 = height,
//     w5 = dy, w6 = max_base_y. Falls through to the padding fill
//     once base runs past max_base_y.
function ipred_z3_fill2_16bpc_neon, export=1
        cmp             w4,  #8
        add             x10, x2,  w6,  uxtw       // left[max_base_y]
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // ypos
        mov             w15, #64
        add             x13, x0,  x1              // second dst ptr (odd rows)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        b.eq            8f

4:      // h == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // left[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8h,   v0.8h,   v0.8h   // left[base+1]
        uzp1            v0.8h,   v0.8h,   v0.8h   // left[base]
        uzp2            v3.8h,   v2.8h,   v2.8h
        uzp1            v2.8h,   v2.8h,   v2.8h
        sub             v6.4h,   v1.4h,   v0.4h   // left[base+1]-left[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // left[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (left[base+1]-left[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        subs            w3,  w3,  #2
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[2], [x0]
        st1             {v18.s}[3], [x13]
        b.le            9f
        sub             x0,  x0,  x1              // x0 advanced by one 2*stride above; back to row 0
        sub             x13, x13, x1
        add             x0,  x0,  #4              // advance 2 pixels, to the next column pair
        add             x13, x13, #4
        b               4b
9:
        ret

8:      // h == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h, v1.8h},  [x8]     // left[base]
        ld1             {v2.8h, v3.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        uzp2            v20.8h,  v0.8h,   v1.8h   // left[base+1]
        uzp1            v0.8h,   v0.8h,   v1.8h   // left[base]
        uzp2            v21.8h,  v2.8h,   v3.8h
        uzp1            v2.8h,   v2.8h,   v3.8h
        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
        umlal           v16.4s,  v20.4h,  v4.4h   // + left[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v20.8h,  v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v21.4h,  v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v21.8h,  v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        subs            w3,  w3,  #2
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        zip2            v19.8h,  v16.8h,  v17.8h
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        b.le            9f
        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4              // advance 2 pixels, to the next column pair
        add             x13, x13, #4
        b               8b
9:
        ret
endfunc
3871
3872
3873// void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
3874//                              const pixel *const topleft,
3875//                              const int width, const int height, const int filt_idx,
3876//                              const int max_width, const int max_height,
3877//                              const int bitdepth_max);
.macro filter_fn bpc
// Filter intra prediction. Each 4x2 output block is a 7-tap
// combination of the topleft pixel, 4 top pixels and 2 left pixels,
// using one of the signed 8-bit tap sets from filter_intra_taps
// (selected by filt_idx in w5). Already-predicted pixels feed the
// next 4x2 block, which is why the blocks are chained serially.
// For \bpc == 10 the tap*pixel products fit in 16 bits, so plain
// 8h mul/mla accumulation is used (with v30 = 0 as the lower clamp
// after the signed srshr); for 12 bpc, 32-bit smull/smlal with
// sqrshrun doing the clamp-to-zero. v31 = bitdepth_max upper clamp.
function ipred_filter_\bpc\()bpc_neon
        and             w5,  w5,  #511
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6
        add             x6,  x6,  w5, uxtw
        // v16-v22 = the 7 tap vectors, sign-extended to 16 bit
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        adr             x5,  L(ipred_filter\bpc\()_tbl)
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26
        ldrh            w9,  [x5, w9, uxtw #1]
        sxtl            v16.8h,  v16.8b
        sxtl            v17.8h,  v17.8b
        sub             x5,  x5,  w9, uxtw
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1              // second dst ptr (odd rows)
        lsl             x1,  x1,  #1              // x1 = 2*stride
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        dup             v31.8h,  w8               // bitdepth_max
.if \bpc == 10
        movi            v30.8h,  #0
.endif
        br              x5                        // dispatch on log2(width)
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #2]             // top (0-3)
        sub             x2,  x2,  #4
        mov             x7,  #-4
4:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
.endif
        smin            v2.8h,   v2.8h,   v31.8h
        subs            w4,  w4,  #2
        st1             {v2.d}[0], [x0], x1
        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
        st1             {v2.d}[1], [x6], x1
        b.gt            4b
        ret
80:     // w == 8; two chained 4x2 blocks per iteration, the second one
        // using outputs of the first (v2.h[3]/v2.h[7]) as its left pixels
        AARCH64_VALID_JUMP_TARGET
        ldur            q0,  [x2, #2]             // top (0-7)
        sub             x2,  x2,  #4
        mov             x7,  #-4
8:
        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
.if \bpc == 10
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        srshr           v2.8h,   v2.8h,   #4
        smax            v2.8h,   v2.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
.else
        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.4h,   v2.4s,   #4
        sqrshrun2       v2.8h,   v3.4s,   #4
        smin            v2.8h,   v2.8h,   v31.8h
        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.4h,   v4.4s,   #4
        sqrshrun2       v3.8h,   v5.4s,   #4
.endif
        smin            v3.8h,   v3.8h,   v31.8h
        subs            w4,  w4,  #2
        st2             {v2.d, v3.d}[0], [x0], x1
        zip2            v0.2d,   v2.2d,   v3.2d   // bottom row becomes next top
        st2             {v2.d, v3.d}[1], [x6], x1
        b.gt            8b
        ret
160:    // w == 16 or 32; four chained 4x2 blocks per 16 pixels, each
320:    // feeding its rightmost outputs to the next block's left pixels
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #2
        sub             x2,  x2,  #4
        mov             x7,  #-4
        sub             x1,  x1,  w3, uxtw #1
        mov             w9,  w3

1:
        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
2:
        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
.if \bpc == 10
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        srshr           v3.8h,   v3.8h,   #4
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v3.8h,   v3.8h,   v31.8h
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        srshr           v4.8h,   v4.8h,   #4
        smax            v4.8h,   v4.8h,   v30.8h
        smin            v4.8h,   v4.8h,   v31.8h
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        srshr           v5.8h,   v5.8h,   #4
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v5.8h,   v5.8h,   v31.8h
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        srshr           v6.8h,   v6.8h,   #4
        smax            v6.8h,   v6.8h,   v30.8h
.else
        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.4h,   v3.4s,   #4
        sqrshrun2       v3.8h,   v4.4s,   #4
        smin            v3.8h,   v3.8h,   v31.8h
        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)

        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.4h,   v5.4s,   #4
        sqrshrun2       v4.8h,   v6.4s,   #4
        smin            v4.8h,   v4.8h,   v31.8h
        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)

        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.4h,   v24.4s,  #4
        sqrshrun2       v5.8h,   v25.4s,  #4
        smin            v5.8h,   v5.8h,   v31.8h
        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.4h,   v26.4s,  #4
        sqrshrun2       v6.8h,   v27.4s,  #4
.endif
        smin            v6.8h,   v6.8h,   v31.8h

        // Carry the row's rightmost outputs into v0 as the left/topleft
        // inputs for the next two rows.
        ins             v0.h[2], v2.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
        ins             v0.h[0], v6.h[7]
        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
        ins             v0.h[1], v6.h[3]
        b.gt            2b
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw #1     // rewind top ptr to the row just written
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret

L(ipred_filter\bpc\()_tbl):
        .hword L(ipred_filter\bpc\()_tbl) - 320b
        .hword L(ipred_filter\bpc\()_tbl) - 160b
        .hword L(ipred_filter\bpc\()_tbl) -  80b
        .hword L(ipred_filter\bpc\()_tbl) -  40b
endfunc
.endm
4173
// Instantiate the filter function for both accumulator widths:
// the 10 bpc version accumulates in 16 bits (mul/mla .8h), the
// 12 bpc version in 32 bits (smull/smlal .4s).
filter_fn 10
filter_fn 12
4176
// Exported entry point: dispatch on the bitdepth_max argument (passed
// on the stack) — values <= 0x3ff select the 10 bpc (16-bit
// accumulator) implementation, larger values the 12 bpc one.
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]                 // bitdepth_max
        cmp             w8,  0x3ff
        b.le            ipred_filter_10bpc_neon
        b               ipred_filter_12bpc_neon
endfunc
4183
// void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                          const pixel *const pal, const uint8_t *idx,
//                          const int w, const int h);
// Palette prediction: expand packed palette indices into 16 bit pixels.
// Each idx byte packs two palette indices (one in bits 0-2, the other in
// the high nibble); they are turned into byte offsets into the 8-entry
// palette and fetched with a byte-wise tbl lookup.
function pal_pred_16bpc_neon, export=1
        ld1             {v30.8h}, [x2]          // v30 = palette, 8 x 16 bit entries
        clz             w9,  w4                 // w is a power of two in [4, 64]
        adr             x6,  L(pal_pred_tbl)
        sub             w9,  w9,  #25           // table index: 0 (w=64) .. 4 (w=4)
        movi            v29.16b, #7             // mask for a 3 bit palette index
        ldrh            w9,  [x6, w9, uxtw #1]
        movi            v31.8h,  #1, lsl #8     // 0x0100: +1 on every odd byte lane
        sub             x6,  x6,  w9, uxtw
        br              x6
40:     // w = 4: 8 idx bytes = 16 pixels = 4 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1            // x2 writes the odd rows
        lsl             x1,  x1,  #1            // step two rows per store
4:
        ld1             {v1.8b}, [x3], #8
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4    // high-nibble indices
        and             v2.8b,   v1.8b,   v29.8b // low indices (bits 0-2)
        zip1            v1.16b,  v2.16b,  v3.16b // back into original pixel order
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        add             v1.16b,  v1.16b,  v1.16b
        zip1            v0.16b,  v1.16b,  v1.16b
        zip2            v1.16b,  v1.16b,  v1.16b
        add             v0.8h,   v0.8h,   v31.8h // per pixel: bytes (2*i, 2*i+1),
        add             v1.8h,   v1.8h,   v31.8h // the two bytes of palette entry i
        tbl             v0.16b, {v30.16b}, v0.16b // fetch 16 bit entries bytewise
        st1             {v0.d}[0], [x0], x1
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.d}[1], [x2], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x2], x1
        b.gt            4b
        ret
80:     // w = 8: 16 idx bytes = 32 pixels = 4 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
8:
        ld1             {v2.16b}, [x3], #16
        subs            w5,  w5,  #4
        ushr            v4.16b,  v2.16b,  #4    // high-nibble indices
        and             v3.16b,  v2.16b,  v29.16b // low indices
        zip1            v2.16b,  v3.16b,  v4.16b
        zip2            v3.16b,  v3.16b,  v4.16b
        // Index -> byte offset pairs (2*i, 2*i+1), as in the w=4 case.
        add             v2.16b,  v2.16b,  v2.16b
        add             v3.16b,  v3.16b,  v3.16b
        zip1            v0.16b,  v2.16b,  v2.16b
        zip2            v1.16b,  v2.16b,  v2.16b
        zip1            v2.16b,  v3.16b,  v3.16b
        zip2            v3.16b,  v3.16b,  v3.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.8h}, [x0], x1
        tbl             v2.16b, {v30.16b}, v2.16b
        st1             {v1.8h}, [x2], x1
        tbl             v3.16b, {v30.16b}, v3.16b
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x2], x1
        b.gt            8b
        ret
160:    // w = 16: 32 idx bytes = 64 pixels = 4 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
16:
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #4
        ushr            v7.16b,  v4.16b,  #4    // split both idx vectors into
        and             v6.16b,  v4.16b,  v29.16b // high/low index halves
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        // Index -> byte offset pairs (2*i, 2*i+1), as in the w=4 case.
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b // tbl lookups interleaved with
        add             v5.8h,   v5.8h,   v31.8h  // the remaining adds/stores to
        tbl             v1.16b, {v30.16b}, v1.16b // hide their latency
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        st1             {v2.8h, v3.8h}, [x2], x1
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h}, [x0], x1
        st1             {v6.8h, v7.8h}, [x2], x1
        b.gt            16b
        ret
320:    // w = 32: 32 idx bytes = 64 pixels = 2 rows per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1
        lsl             x1,  x1,  #1
32:
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #2
        ushr            v7.16b,  v4.16b,  #4    // same expansion as the w=16
        and             v6.16b,  v4.16b,  v29.16b // case, stored as 2 rows of 32
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            32b
        ret
640:    // w = 64: 32 idx bytes = 64 pixels = 1 row per iteration
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  #64           // x2 = second half of the row
64:
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #1
        ushr            v7.16b,  v4.16b,  #4    // same expansion as the w=16
        and             v6.16b,  v4.16b,  v29.16b // case, stored as 1 row of 64
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            64b
        ret

L(pal_pred_tbl): // backwards .hword offsets from the table to each width case
        .hword L(pal_pred_tbl) - 640b
        .hword L(pal_pred_tbl) - 320b
        .hword L(pal_pred_tbl) - 160b
        .hword L(pal_pred_tbl) -  80b
        .hword L(pal_pred_tbl) -  40b
endfunc
4402
// void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
// CFL prediction with a fixed mid-gray DC. Also defines the shared
// L(ipred_cfl_splat_*) entry points, which write
// clamp(dc + ((ac * alpha + sign + 32) >> 6), 0, bitdepth_max)
// and are reused by the other ipred_cfl_* functions below.
function ipred_cfl_128_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_128_tbl)
        sub             w9,  w9,  #26 // table index: 0 (w=32) .. 3 (w=4)
        ldrh            w9,  [x7, w9, uxtw #1]
        urshr           v0.8h,   v31.8h,  #1 // dc = (bitdepth_max + 1) >> 1
        dup             v1.8h,   w6   // alpha
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1  // x6 writes the second of each row pair
        lsl             x1,  x1,  #1  // step two rows at a time
        movi            v30.8h,  #0   // lower clamp bound
        br              x7
L(ipred_cfl_splat_w4): // 4 rows of width 4 per iteration (16 ac coefficients)
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #4
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.d}[0],  [x0], x1
        st1             {v2.d}[1],  [x6], x1
        st1             {v3.d}[0],  [x0], x1
        st1             {v3.d}[1],  [x6], x1
        b.gt            L(ipred_cfl_splat_w4)
        ret
L(ipred_cfl_splat_w8): // 2 rows of width 8 per iteration
        AARCH64_VALID_JUMP_TARGET
        ld1             {v4.8h, v5.8h}, [x5], #32
        subs            w4,  w4,  #2
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h},  [x0], x1
        st1             {v3.8h},  [x6], x1
        b.gt            L(ipred_cfl_splat_w8)
        ret
L(ipred_cfl_splat_w16): // w = 16 or 32: inner loop over the width, 2 rows at a time
        AARCH64_VALID_JUMP_TARGET
        add             x7,  x5,  w3, uxtw #1 // x7 = ac for the second row
        sub             x1,  x1,  w3, uxtw #1 // stride minus the width written per row
        mov             w9,  w3               // save the width for the outer loop
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        subs            w3,  w3,  #16
        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
        smull2          v17.4s,  v2.8h,   v1.8h
        smull           v18.4s,  v3.4h,   v1.4h
        smull2          v19.4s,  v3.8h,   v1.8h
        smull           v2.4s,   v4.4h,   v1.4h
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v20.4s,  v16.4s,  #0     // sign
        cmlt            v21.4s,  v17.4s,  #0
        cmlt            v22.4s,  v18.4s,  #0
        cmlt            v23.4s,  v19.4s,  #0
        cmlt            v24.4s,  v2.4s,   #0
        cmlt            v25.4s,  v3.4s,   #0
        cmlt            v26.4s,  v4.4s,   #0
        cmlt            v27.4s,  v5.4s,   #0
        add             v16.4s,  v16.4s,  v20.4s // diff + sign
        add             v17.4s,  v17.4s,  v21.4s
        add             v18.4s,  v18.4s,  v22.4s
        add             v19.4s,  v19.4s,  v23.4s
        add             v2.4s,   v2.4s,   v24.4s
        add             v3.4s,   v3.4s,   v25.4s
        add             v4.4s,   v4.4s,   v26.4s
        add             v5.4s,   v5.4s,   v27.4s
        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        rshrn           v6.4h,   v2.4s,   #6
        rshrn2          v6.8h,   v3.4s,   #6
        rshrn           v7.4h,   v4.4s,   #6
        rshrn2          v7.8h,   v5.4s,   #6
        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
        add             v3.8h,   v17.8h,  v0.8h
        add             v4.8h,   v6.8h,   v0.8h
        add             v5.8h,   v7.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smax            v4.8h,   v4.8h,   v30.8h
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v2.8h, v3.8h},  [x0], #32
        st1             {v4.8h, v5.8h},  [x6], #32
        b.gt            1b
        subs            w4,  w4,  #2             // next pair of rows
        add             x5,  x5,  w9, uxtw #1    // skip the ac row x7 consumed
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                  // restore the width counter
        b.gt            1b
        ret

L(ipred_cfl_128_tbl):
L(ipred_cfl_splat_tbl):
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w16)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w8)
        .hword L(ipred_cfl_128_tbl) - L(ipred_cfl_splat_w4)
endfunc
4555
// void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const topleft,
//                               const int width, const int height,
//                               const int16_t *ac, const int alpha,
//                               const int bitdepth_max);
// CFL prediction with dc = rounded average of the top edge only, then
// tail-branches into the shared L(ipred_cfl_splat_*) code above.
function ipred_cfl_top_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        clz             w9,  w3
        adr             x7,  L(ipred_cfl_top_tbl)
        sub             w9,  w9,  #26 // table index: 0 (w=32) .. 3 (w=4)
        ldrh            w9,  [x7, w9, uxtw #1]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #2  // skip the topleft pixel itself
        sub             x7,  x7,  w9, uxtw
        add             x6,  x0,  x1  // second-row dst, as the splat code expects
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0   // lower clamp bound
        br              x7
4:      // w = 4: dc = (sum of 4 top pixels + 2) >> 2
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w4)
8:      // w = 8: dc = (sum + 4) >> 3
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:     // w = 16: dc = (sum + 8) >> 4
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:     // w = 32: widen to 32 bit for the sum, dc = (sum + 16) >> 5
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v0.4h,   v0.4s,   #5
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_top_tbl):
        .hword L(ipred_cfl_top_tbl) - 32b
        .hword L(ipred_cfl_top_tbl) - 16b
        .hword L(ipred_cfl_top_tbl) -  8b
        .hword L(ipred_cfl_top_tbl) -  4b
endfunc
4613
// void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const topleft,
//                                const int width, const int height,
//                                const int16_t *ac, const int alpha,
//                                const int bitdepth_max);
// CFL prediction with dc = rounded average of the left edge only. The
// height selects the averaging routine (via x7); the width selects the
// shared L(ipred_cfl_splat_*) writeout routine (via x9).
function ipred_cfl_left_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1 // x2 = start of the left column
        clz             w9,  w3
        clz             w8,  w4
        adr             x10, L(ipred_cfl_splat_tbl)
        adr             x7,  L(ipred_cfl_left_tbl)
        sub             w9,  w9,  #26 // width index:  0 (32) .. 3 (4)
        sub             w8,  w8,  #26 // height index: 0 (32) .. 3 (4)
        ldrh            w9,  [x10, w9, uxtw #1]
        ldrh            w8,  [x7,  w8, uxtw #1]
        dup             v1.8h,   w6   // alpha
        sub             x9,  x10, w9, uxtw // x9 = splat routine for this width
        sub             x7,  x7,  w8, uxtw // x7 = averaging routine for this height
        add             x6,  x0,  x1
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0   // lower clamp bound
        br              x7

L(ipred_cfl_left_h4): // dc = (sum of 4 left pixels + 2) >> 2
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2]
        addv            h0,      v0.4h
        urshr           v0.4h,   v0.4h,   #2
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h8): // dc = (sum + 4) >> 3
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2]
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #3
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16): // dc = (sum + 8) >> 4
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        addp            v0.8h,   v2.8h,   v3.8h
        addv            h0,      v0.8h
        urshr           v0.4h,   v0.4h,   #4
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32): // widen to 32 bit for the sum, dc = (sum + 16) >> 5
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        uaddlv          s0,      v0.8h
        rshrn           v0.4h,   v0.4s,   #5
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_tbl):
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h32)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h16)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h8)
        .hword L(ipred_cfl_left_tbl) - L(ipred_cfl_left_h4)
endfunc
4680
// void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                           const pixel *const topleft,
//                           const int width, const int height,
//                           const int16_t *ac, const int alpha,
//                           const int bitdepth_max);
// Full CFL prediction: dc = (sum of left + top edges + (w+h)/2) / (w+h).
// The height handler (via x7) sums the left column, then jumps to the
// width handler (via x9), which adds the top row and divides. When w+h
// is not a power of two, the division is done as a shift by ctz(w+h)
// followed by a fixed-point multiply by ~2^17/3 (0xAAAB) or ~2^17/5
// (0x6667), since (w+h) >> ctz(w+h) is always 3 or 5 for mixed sizes.
function ipred_cfl_16bpc_neon, export=1
        dup             v31.8h,  w7              // bitdepth_max
        sub             x2,  x2,  w4, uxtw #1    // x2 = start of the left column
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.4s, w8               // width + height
        adr             x7,  L(ipred_cfl_tbl)
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrh            w9,  [x7, w9, uxtw #1]
        ldrh            w6,  [x7, w6, uxtw #1]
        neg             w8,  w8                  // -ctz(width + height)
        sub             x9,  x7,  w9, uxtw       // x9 = width handler (second stage)
        sub             x7,  x7,  w6, uxtw       // x7 = height handler (first stage)
        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
        dup             v17.4s,  w8              // -ctz(width + height)
        add             x6,  x0,  x1             // second-row dst for the splat code
        lsl             x1,  x1,  #1
        movi            v30.8h,  #0              // lower clamp bound
        br              x7

L(ipred_cfl_h4): // sum the 4 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.4h},  [x2], #8
        uaddlv          s0,      v0.4h
        add             x2,  x2,  #2             // skip the topleft pixel
        br              x9
L(ipred_cfl_w4): // add the 4 top pixels, divide by w + h
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.4h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding term
        uaddlv          s2,      v2.4h
        cmp             w4,  #4
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #16
        mov             w16, #0x6667             // ~2^17/5 (w+h = 20)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 12)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s // finish the division by 3 or 5
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]         // broadcast dc
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8): // sum the 8 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8h},  [x2], #16
        uaddlv          s0,      v0.8h
        add             x2,  x2,  #2             // skip the topleft pixel
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h},  [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding term
        uaddlv          s2,      v2.8h
        cmp             w4,  #8
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 4/16/32
        cmp             w4,  #32
        mov             w16, #0x6667             // ~2^17/5 (w+h = 40)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 12 or 24)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16): // sum the 16 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2], #32
        addp            v0.8h,   v2.8h,   v3.8h
        add             x2,  x2,  #2             // skip the topleft pixel
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h}, [x2]
        add             v0.2s,   v0.2s,   v16.2s // add the rounding term
        addp            v2.8h,   v2.8h,   v3.8h
        uaddlv          s2,      v2.8h
        cmp             w4,  #16
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 4/8/32
        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
        mov             w16, #0x6667             // ~2^17/5 (h = 4, w+h = 20)
        mov             w17, #0xAAAB             // ~2^17/3 (h = 8/32)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32): // sum the 32 left pixels into s0
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v0.8h,   v2.8h,   v4.8h
        add             x2,  x2,  #2             // skip the topleft pixel
        uaddlv          s0,      v0.8h
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
        add             v0.4s,   v0.4s,   v16.4s // add the rounding term
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v2.8h,   v2.8h,   v4.8h
        cmp             w4,  #32
        uaddlv          s2,      v2.8h
        add             v0.2s,   v0.2s,   v2.2s  // left sum + top sum
        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w + h)
        b.eq            1f
        // h = 8/16
        cmp             w4,  #8
        mov             w16, #0x6667             // ~2^17/5 (w+h = 40)
        mov             w17, #0xAAAB             // ~2^17/3 (w+h = 48)
        csel            w16, w16, w17, eq
        dup             v16.2s,  w16
        mul             v0.2s,   v0.2s,   v16.2s
        ushr            v0.2s,   v0.2s,   #17
1:
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_tbl): // first 4 entries: height handlers; last 4: width handlers
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_h4)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w32)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w16)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w8)
        .hword L(ipred_cfl_tbl) - L(ipred_cfl_w4)
endfunc
4837
4838// void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
4839//                            const ptrdiff_t stride, const int w_pad,
4840//                            const int h_pad, const int cw, const int ch);
4841function ipred_cfl_ac_420_16bpc_neon, export=1
4842        clz             w8,  w5
4843        lsl             w4,  w4,  #2
4844        adr             x7,  L(ipred_cfl_ac_420_tbl)
4845        sub             w8,  w8,  #27
4846        ldrh            w8,  [x7, w8, uxtw #1]
4847        movi            v24.4s,  #0
4848        movi            v25.4s,  #0
4849        movi            v26.4s,  #0
4850        movi            v27.4s,  #0
4851        sub             x7,  x7,  w8, uxtw
4852        sub             w8,  w6,  w4         // height - h_pad
4853        rbit            w9,  w5              // rbit(width)
4854        rbit            w10, w6              // rbit(height)
4855        clz             w9,  w9              // ctz(width)
4856        clz             w10, w10             // ctz(height)
4857        add             w9,  w9,  w10        // log2sz
4858        add             x10, x1,  x2
4859        dup             v31.4s,  w9
4860        lsl             x2,  x2,  #1
4861        neg             v31.4s,  v31.4s      // -log2sz
4862        br              x7
4863
4864L(ipred_cfl_ac_420_w4):
4865        AARCH64_VALID_JUMP_TARGET
48661:      // Copy and subsample input
4867        ld1             {v0.8h}, [x1],  x2
4868        ld1             {v1.8h}, [x10], x2
4869        ld1             {v2.8h}, [x1],  x2
4870        ld1             {v3.8h}, [x10], x2
4871        addp            v0.8h,   v0.8h,   v2.8h
4872        addp            v1.8h,   v1.8h,   v3.8h
4873        add             v0.8h,   v0.8h,   v1.8h
4874        shl             v0.8h,   v0.8h,   #1
4875        subs            w8,  w8,  #2
4876        st1             {v0.8h}, [x0], #16
4877        uaddw           v24.4s,  v24.4s,  v0.4h
4878        uaddw2          v25.4s,  v25.4s,  v0.8h
4879        b.gt            1b
4880        trn2            v1.2d,   v0.2d,   v0.2d
4881        trn2            v0.2d,   v0.2d,   v0.2d
4882L(ipred_cfl_ac_420_w4_hpad):
4883        cbz             w4,  3f
48842:      // Vertical padding (h_pad > 0)
4885        subs            w4,  w4,  #4
4886        st1             {v0.8h, v1.8h}, [x0], #32
4887        uaddw           v24.4s,  v24.4s,  v0.4h
4888        uaddw2          v25.4s,  v25.4s,  v0.8h
4889        uaddw           v26.4s,  v26.4s,  v1.4h
4890        uaddw2          v27.4s,  v27.4s,  v1.8h
4891        b.gt            2b
48923:
4893L(ipred_cfl_ac_420_w4_calc_subtract_dc):
4894        // Aggregate the sums
4895        add             v24.4s,  v24.4s,  v25.4s
4896        add             v26.4s,  v26.4s,  v27.4s
4897        add             v0.4s,   v24.4s,  v26.4s
4898        addv            s0,  v0.4s                // sum
4899        sub             x0,  x0,  w6, uxtw #3
4900        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
4901        dup             v4.8h,   v4.h[0]
49026:      // Subtract dc from ac
4903        ld1             {v0.8h, v1.8h}, [x0]
4904        subs            w6,  w6,  #4
4905        sub             v0.8h,   v0.8h,   v4.8h
4906        sub             v1.8h,   v1.8h,   v4.8h
4907        st1             {v0.8h, v1.8h}, [x0], #32
4908        b.gt            6b
4909        ret
4910
4911L(ipred_cfl_ac_420_w8):
4912        AARCH64_VALID_JUMP_TARGET
4913        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
49141:      // Copy and subsample input, without padding
4915        ld1             {v0.8h, v1.8h}, [x1],  x2
4916        ld1             {v2.8h, v3.8h}, [x10], x2
4917        ld1             {v4.8h, v5.8h}, [x1],  x2
4918        addp            v0.8h,   v0.8h,   v1.8h
4919        ld1             {v6.8h, v7.8h}, [x10], x2
4920        addp            v2.8h,   v2.8h,   v3.8h
4921        addp            v4.8h,   v4.8h,   v5.8h
4922        addp            v6.8h,   v6.8h,   v7.8h
4923        add             v0.8h,   v0.8h,   v2.8h
4924        add             v4.8h,   v4.8h,   v6.8h
4925        shl             v0.8h,   v0.8h,   #1
4926        shl             v1.8h,   v4.8h,   #1
4927        subs            w8,  w8,  #2
4928        st1             {v0.8h, v1.8h}, [x0], #32
4929        uaddw           v24.4s,  v24.4s,  v0.4h
4930        uaddw2          v25.4s,  v25.4s,  v0.8h
4931        uaddw           v26.4s,  v26.4s,  v1.4h
4932        uaddw2          v27.4s,  v27.4s,  v1.8h
4933        b.gt            1b
4934        mov             v0.16b,  v1.16b
4935        b               L(ipred_cfl_ac_420_w8_hpad)
4936
4937L(ipred_cfl_ac_420_w8_wpad):
49381:      // Copy and subsample input, padding 4
4939        ld1             {v0.8h}, [x1],  x2
4940        ld1             {v1.8h}, [x10], x2
4941        ld1             {v2.8h}, [x1],  x2
4942        ld1             {v3.8h}, [x10], x2
4943        addp            v0.8h,   v0.8h,   v2.8h
4944        addp            v1.8h,   v1.8h,   v3.8h
4945        add             v0.8h,   v0.8h,   v1.8h
4946        shl             v0.8h,   v0.8h,   #1
4947        dup             v1.4h,   v0.h[3]
4948        dup             v3.4h,   v0.h[7]
4949        trn2            v2.2d,   v0.2d,   v0.2d
4950        subs            w8,  w8,  #2
4951        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
4952        uaddw           v24.4s,  v24.4s,  v0.4h
4953        uaddw           v25.4s,  v25.4s,  v1.4h
4954        uaddw           v26.4s,  v26.4s,  v2.4h
4955        uaddw           v27.4s,  v27.4s,  v3.4h
4956        b.gt            1b
4957        trn1            v0.2d,   v2.2d,   v3.2d
4958        trn1            v1.2d,   v2.2d,   v3.2d
4959
4960L(ipred_cfl_ac_420_w8_hpad):
4961        cbz             w4,  3f
49622:      // Vertical padding (h_pad > 0)
4963        subs            w4,  w4,  #4
4964        st1             {v0.8h, v1.8h}, [x0], #32
4965        uaddw           v24.4s,  v24.4s,  v0.4h
4966        uaddw2          v25.4s,  v25.4s,  v0.8h
4967        uaddw           v26.4s,  v26.4s,  v1.4h
4968        uaddw2          v27.4s,  v27.4s,  v1.8h
4969        st1             {v0.8h, v1.8h}, [x0], #32
4970        uaddw           v24.4s,  v24.4s,  v0.4h
4971        uaddw2          v25.4s,  v25.4s,  v0.8h
4972        uaddw           v26.4s,  v26.4s,  v1.4h
4973        uaddw2          v27.4s,  v27.4s,  v1.8h
4974        b.gt            2b
49753:
4976
4977        // Double the height and reuse the w4 summing/subtracting
4978        lsl             w6,  w6,  #1
4979        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
4980
4981L(ipred_cfl_ac_420_w16):
4982        AARCH64_VALID_JUMP_TARGET
4983        adr             x7,  L(ipred_cfl_ac_420_w16_tbl)
4984        ldrh            w3,  [x7, w3, uxtw #1]
4985        sub             x7,  x7,  w3, uxtw
4986        br              x7
4987
4988L(ipred_cfl_ac_420_w16_wpad0):
4989        AARCH64_VALID_JUMP_TARGET
49901:      // Copy and subsample input, without padding
4991        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
4992        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
4993        addp            v0.8h,   v0.8h,   v1.8h
4994        addp            v2.8h,   v2.8h,   v3.8h
4995        addp            v4.8h,   v4.8h,   v5.8h
4996        addp            v6.8h,   v6.8h,   v7.8h
4997        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
4998        add             v0.8h,   v0.8h,   v4.8h
4999        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
5000        add             v2.8h,   v2.8h,   v6.8h
5001        addp            v16.8h,  v16.8h,  v17.8h
5002        addp            v18.8h,  v18.8h,  v19.8h
5003        addp            v20.8h,  v20.8h,  v21.8h
5004        addp            v22.8h,  v22.8h,  v23.8h
5005        add             v16.8h,  v16.8h,  v20.8h
5006        add             v18.8h,  v18.8h,  v22.8h
5007        shl             v0.8h,   v0.8h,   #1
5008        shl             v1.8h,   v2.8h,   #1
5009        shl             v2.8h,   v16.8h,  #1
5010        shl             v3.8h,   v18.8h,  #1
5011        subs            w8,  w8,  #2
5012        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5013        uaddw           v24.4s,  v24.4s,  v0.4h
5014        uaddw2          v25.4s,  v25.4s,  v0.8h
5015        uaddw           v26.4s,  v26.4s,  v1.4h
5016        uaddw2          v27.4s,  v27.4s,  v1.8h
5017        uaddw           v24.4s,  v24.4s,  v2.4h
5018        uaddw2          v25.4s,  v25.4s,  v2.8h
5019        uaddw           v26.4s,  v26.4s,  v3.4h
5020        uaddw2          v27.4s,  v27.4s,  v3.8h
5021        b.gt            1b
5022        mov             v0.16b,  v2.16b
5023        mov             v1.16b,  v3.16b
5024        b               L(ipred_cfl_ac_420_w16_hpad)
5025
5026L(ipred_cfl_ac_420_w16_wpad1):
5027        AARCH64_VALID_JUMP_TARGET
50281:      // Copy and subsample input, padding 4
5029        ldr             q2,  [x1,  #32]
5030        ld1             {v0.8h, v1.8h}, [x1],  x2
5031        ldr             q5,  [x10, #32]
5032        ld1             {v3.8h, v4.8h}, [x10], x2
5033        addp            v2.8h,   v2.8h,   v2.8h
5034        addp            v0.8h,   v0.8h,   v1.8h
5035        addp            v5.8h,   v5.8h,   v5.8h
5036        addp            v3.8h,   v3.8h,   v4.8h
5037        ldr             q18, [x1,  #32]
5038        add             v2.4h,   v2.4h,   v5.4h
5039        ld1             {v16.8h, v17.8h}, [x1],  x2
5040        add             v0.8h,   v0.8h,   v3.8h
5041        ldr             q21, [x10, #32]
5042        ld1             {v19.8h, v20.8h}, [x10], x2
5043        addp            v18.8h,  v18.8h,  v18.8h
5044        addp            v16.8h,  v16.8h,  v17.8h
5045        addp            v21.8h,  v21.8h,  v21.8h
5046        addp            v19.8h,  v19.8h,  v20.8h
5047        add             v18.4h,  v18.4h,  v21.4h
5048        add             v16.8h,  v16.8h,  v19.8h
5049        shl             v1.4h,   v2.4h,   #1
5050        shl             v0.8h,   v0.8h,   #1
5051        shl             v3.4h,   v18.4h,  #1
5052        shl             v2.8h,   v16.8h,  #1
5053        dup             v4.4h,   v1.h[3]
5054        dup             v5.4h,   v3.h[3]
5055        trn1            v1.2d,   v1.2d,   v4.2d
5056        trn1            v3.2d,   v3.2d,   v5.2d
5057        subs            w8,  w8,  #2
5058        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5059        uaddw           v24.4s,  v24.4s,  v0.4h
5060        uaddw2          v25.4s,  v25.4s,  v0.8h
5061        uaddw           v26.4s,  v26.4s,  v1.4h
5062        uaddw2          v27.4s,  v27.4s,  v1.8h
5063        uaddw           v24.4s,  v24.4s,  v2.4h
5064        uaddw2          v25.4s,  v25.4s,  v2.8h
5065        uaddw           v26.4s,  v26.4s,  v3.4h
5066        uaddw2          v27.4s,  v27.4s,  v3.8h
5067        b.gt            1b
5068        mov             v0.16b,  v2.16b
5069        mov             v1.16b,  v3.16b
5070        b               L(ipred_cfl_ac_420_w16_hpad)
5071
5072L(ipred_cfl_ac_420_w16_wpad2):
5073        AARCH64_VALID_JUMP_TARGET
50741:      // Copy and subsample input, padding 8
5075        ld1             {v0.8h, v1.8h}, [x1],  x2
5076        ld1             {v2.8h, v3.8h}, [x10], x2
5077        ld1             {v4.8h, v5.8h}, [x1],  x2
5078        addp            v0.8h,   v0.8h,   v1.8h
5079        ld1             {v6.8h, v7.8h}, [x10], x2
5080        addp            v2.8h,   v2.8h,   v3.8h
5081        addp            v4.8h,   v4.8h,   v5.8h
5082        addp            v6.8h,   v6.8h,   v7.8h
5083        add             v0.8h,   v0.8h,   v2.8h
5084        add             v4.8h,   v4.8h,   v6.8h
5085        shl             v0.8h,   v0.8h,   #1
5086        shl             v2.8h,   v4.8h,   #1
5087        dup             v1.8h,   v0.h[7]
5088        dup             v3.8h,   v2.h[7]
5089        subs            w8,  w8,  #2
5090        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5091        uaddw           v24.4s,  v24.4s,  v0.4h
5092        uaddw2          v25.4s,  v25.4s,  v0.8h
5093        uaddw           v26.4s,  v26.4s,  v1.4h
5094        uaddw2          v27.4s,  v27.4s,  v1.8h
5095        uaddw           v24.4s,  v24.4s,  v2.4h
5096        uaddw2          v25.4s,  v25.4s,  v2.8h
5097        uaddw           v26.4s,  v26.4s,  v3.4h
5098        uaddw2          v27.4s,  v27.4s,  v3.8h
5099        b.gt            1b
5100        mov             v0.16b,  v2.16b
5101        mov             v1.16b,  v3.16b
5102        b               L(ipred_cfl_ac_420_w16_hpad)
5103
5104L(ipred_cfl_ac_420_w16_wpad3):
5105        AARCH64_VALID_JUMP_TARGET
51061:      // Copy and subsample input, padding 12
5107        ld1             {v0.8h}, [x1],  x2
5108        ld1             {v2.8h}, [x10], x2
5109        ld1             {v4.8h}, [x1],  x2
5110        ld1             {v6.8h}, [x10], x2
5111        addp            v0.8h,   v0.8h,   v4.8h
5112        addp            v2.8h,   v2.8h,   v6.8h
5113        add             v0.8h,   v0.8h,   v2.8h
5114        shl             v0.8h,   v0.8h,   #1
5115        dup             v1.8h,   v0.h[3]
5116        dup             v3.8h,   v0.h[7]
5117        trn2            v2.2d,   v0.2d,   v3.2d
5118        trn1            v0.2d,   v0.2d,   v1.2d
5119        subs            w8,  w8,  #2
5120        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5121        uaddw           v24.4s,  v24.4s,  v0.4h
5122        uaddw2          v25.4s,  v25.4s,  v0.8h
5123        uaddw           v26.4s,  v26.4s,  v1.4h
5124        uaddw2          v27.4s,  v27.4s,  v1.8h
5125        uaddw           v24.4s,  v24.4s,  v2.4h
5126        uaddw2          v25.4s,  v25.4s,  v2.8h
5127        uaddw           v26.4s,  v26.4s,  v3.4h
5128        uaddw2          v27.4s,  v27.4s,  v3.8h
5129        b.gt            1b
5130        mov             v0.16b,  v2.16b
5131        mov             v1.16b,  v3.16b
5132
5133L(ipred_cfl_ac_420_w16_hpad):
5134        cbz             w4,  3f
51352:      // Vertical padding (h_pad > 0)
5136        subs            w4,  w4,  #4
5137        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5138        uaddw           v24.4s,  v24.4s,  v0.4h
5139        uaddw2          v25.4s,  v25.4s,  v0.8h
5140        uaddw           v26.4s,  v26.4s,  v1.4h
5141        uaddw2          v27.4s,  v27.4s,  v1.8h
5142        uaddw           v24.4s,  v24.4s,  v2.4h
5143        uaddw2          v25.4s,  v25.4s,  v2.8h
5144        uaddw           v26.4s,  v26.4s,  v3.4h
5145        uaddw2          v27.4s,  v27.4s,  v3.8h
5146        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5147        uaddw           v24.4s,  v24.4s,  v0.4h
5148        uaddw2          v25.4s,  v25.4s,  v0.8h
5149        uaddw           v26.4s,  v26.4s,  v1.4h
5150        uaddw2          v27.4s,  v27.4s,  v1.8h
5151        uaddw           v24.4s,  v24.4s,  v2.4h
5152        uaddw2          v25.4s,  v25.4s,  v2.8h
5153        uaddw           v26.4s,  v26.4s,  v3.4h
5154        uaddw2          v27.4s,  v27.4s,  v3.8h
5155        b.gt            2b
51563:
5157
5158        // Quadruple the height and reuse the w4 summing/subtracting
5159        lsl             w6,  w6,  #2
5160        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5161
5162L(ipred_cfl_ac_420_tbl):
5163        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w16)
5164        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w8)
5165        .hword L(ipred_cfl_ac_420_tbl) - L(ipred_cfl_ac_420_w4)
5166        .hword 0
5167
5168L(ipred_cfl_ac_420_w16_tbl):
5169        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad0)
5170        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad1)
5171        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad2)
5172        .hword L(ipred_cfl_ac_420_w16_tbl) - L(ipred_cfl_ac_420_w16_wpad3)
5173endfunc
5174
// void ipred_cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
function ipred_cfl_ac_422_16bpc_neon, export=1
        // Entry registers (per the C prototype above):
        //   x0 = ac output (int16_t *), x1 = ypx (luma), x2 = stride (bytes)
        //   w3 = w_pad, w4 = h_pad (units of 4), w5 = cw (width), w6 = ch (height)
        // Running sums of all stored AC values are accumulated in v24-v27
        // (.4s lanes) so the shared 4:2:0 tail can average and subtract the DC.
        clz             w8,  w5              // clz(width) = 29/28/27 for w = 4/8/16
        lsl             w4,  w4,  #2         // h_pad in units of 4 -> rows
        adr             x7,  L(ipred_cfl_ac_422_tbl)
        sub             w8,  w8,  #27        // table index 2/1/0 for w = 4/8/16
        ldrh            w8,  [x7, w8, uxtw #1]
        movi            v24.4s,  #0
        movi            v25.4s,  #0
        movi            v26.4s,  #0
        movi            v27.4s,  #0
        sub             x7,  x7,  w8, uxtw   // resolve branch target from offset table
        sub             w8,  w6,  w4         // height - h_pad
        rbit            w9,  w5              // rbit(width)
        rbit            w10, w6              // rbit(height)
        clz             w9,  w9              // ctz(width)
        clz             w10, w10             // ctz(height)
        add             w9,  w9,  w10        // log2sz
        add             x10, x1,  x2         // x10 = pointer to the second input row
        dup             v31.4s,  w9
        lsl             x2,  x2,  #1         // both pointers step two rows at a time
        neg             v31.4s,  v31.4s      // -log2sz
        br              x7

L(ipred_cfl_ac_422_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input
        // Four rows of 8 luma samples per iteration; addp sums horizontal
        // pairs (4:2:2 -> halve width only), and << 2 scales the 2-sample
        // sum so every stored AC value is 8x the subsampled average.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v2.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h}, [x0], #32
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        b.gt            1b
        // Broadcast the last 4-sample output row (high half of v1) into both
        // halves of v0 and v1, then reuse the 4:2:0 vertical-padding tail.
        trn2            v0.2d,   v1.2d,   v1.2d
        trn2            v1.2d,   v1.2d,   v1.2d
        b               L(ipred_cfl_ac_420_w4_hpad)

L(ipred_cfl_ac_422_w8):
        AARCH64_VALID_JUMP_TARGET
        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
1:      // Copy and subsample input, without padding
        // Four rows of 16 luma samples per iteration -> four 8-wide AC rows.
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        ld1             {v4.8h, v5.8h}, [x1],  x2
        addp            v0.8h,   v0.8h,   v1.8h
        ld1             {v6.8h, v7.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v2.8h,   #2
        shl             v2.8h,   v4.8h,   #2
        shl             v3.8h,   v6.8h,   #2
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v3; copy it into v0/v1 for vertical padding.
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w8_wpad):
1:      // Copy and subsample input, padding 4
        // Only 8 luma samples per row are present; compute the left 4 AC
        // values and replicate the rightmost one across the padded half.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v1.8h}, [x10], x2
        ld1             {v2.8h}, [x1],  x2
        ld1             {v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v4.4h,   v0.h[3]
        dup             v5.8h,   v0.h[7]
        dup             v6.4h,   v2.h[3]
        dup             v7.8h,   v2.h[7]
        trn2            v1.2d,   v0.2d,   v5.2d
        trn1            v0.2d,   v0.2d,   v4.2d
        trn2            v3.2d,   v2.2d,   v7.2d
        trn1            v2.2d,   v2.2d,   v6.2d
        subs            w8,  w8,  #4
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v3; copy it into v0/v1 for vertical padding.
        mov             v0.16b,  v3.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w8_hpad)

L(ipred_cfl_ac_422_w16):
        AARCH64_VALID_JUMP_TARGET
        // Second-level dispatch on w_pad (0-3), same offset-table scheme
        // as the function entry.
        adr             x7,  L(ipred_cfl_ac_422_w16_tbl)
        ldrh            w3,  [x7, w3, uxtw #1]
        sub             x7,  x7,  w3, uxtw
        br              x7

L(ipred_cfl_ac_422_w16_wpad0):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, without padding
        // Two rows of 32 luma samples per iteration -> two 16-wide AC rows.
        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v1.8h,   v2.8h,   #2
        shl             v2.8h,   v4.8h,   #2
        shl             v3.8h,   v6.8h,   #2
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad1):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 4
        // 24 luma samples per row: the third vector (q2/q6) only yields
        // 4 AC values; the final 4 are replicated from its last sample.
        ldr             q2,  [x1,  #32]
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ldr             q6,  [x10, #32]
        ld1             {v4.8h, v5.8h}, [x10], x2
        addp            v2.8h,   v2.8h,   v2.8h
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v6.8h,   v6.8h,   v6.8h
        addp            v4.8h,   v4.8h,   v5.8h
        shl             v1.4h,   v2.4h,   #2
        shl             v0.8h,   v0.8h,   #2
        shl             v3.4h,   v6.4h,   #2
        shl             v2.8h,   v4.8h,   #2
        dup             v4.4h,   v1.h[3]
        dup             v5.4h,   v3.h[3]
        trn1            v1.2d,   v1.2d,   v4.2d
        trn1            v3.2d,   v3.2d,   v5.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 8
        // 16 luma samples per row -> 8 AC values; the right half of each
        // output row replicates the last computed sample.
        ld1             {v0.8h, v1.8h}, [x1],  x2
        ld1             {v2.8h, v3.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        shl             v0.8h,   v0.8h,   #2
        shl             v2.8h,   v2.8h,   #2
        dup             v1.8h,   v0.h[7]
        dup             v3.8h,   v2.h[7]
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

L(ipred_cfl_ac_422_w16_wpad3):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and subsample input, padding 12
        // 8 luma samples per row -> 4 AC values; the remaining 12 per row
        // replicate the last computed sample.
        ld1             {v0.8h}, [x1],  x2
        ld1             {v2.8h}, [x10], x2
        addp            v0.8h,   v0.8h,   v0.8h
        addp            v2.8h,   v2.8h,   v2.8h
        shl             v0.4h,   v0.4h,   #2
        shl             v2.4h,   v2.4h,   #2
        dup             v1.8h,   v0.h[3]
        dup             v3.8h,   v2.h[3]
        trn1            v0.2d,   v0.2d,   v1.2d
        trn1            v2.2d,   v2.2d,   v3.2d
        subs            w8,  w8,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        // Last output row is v2:v3; keep it in v0:v1 for vertical padding.
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v3.16b
        b               L(ipred_cfl_ac_420_w16_hpad)

        // Offset tables: each entry is the distance from the table label back
        // to the handler, loaded with ldrh and subtracted from the table
        // address before the indirect branch.
L(ipred_cfl_ac_422_tbl):
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w16)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w8)
        .hword L(ipred_cfl_ac_422_tbl) - L(ipred_cfl_ac_422_w4)
        .hword 0

L(ipred_cfl_ac_422_w16_tbl):
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad0)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad1)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad2)
        .hword L(ipred_cfl_ac_422_w16_tbl) - L(ipred_cfl_ac_422_w16_wpad3)
endfunc
5422
// void ipred_cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                  const ptrdiff_t stride, const int w_pad,
//                                  const int h_pad, const int cw, const int ch);
5426function ipred_cfl_ac_444_16bpc_neon, export=1
5427        clz             w8,  w5
5428        lsl             w4,  w4,  #2
5429        adr             x7,  L(ipred_cfl_ac_444_tbl)
5430        sub             w8,  w8,  #26
5431        ldrh            w8,  [x7, w8, uxtw #1]
5432        movi            v24.4s,  #0
5433        movi            v25.4s,  #0
5434        movi            v26.4s,  #0
5435        movi            v27.4s,  #0
5436        sub             x7,  x7,  w8, uxtw
5437        sub             w8,  w6,  w4         // height - h_pad
5438        rbit            w9,  w5              // rbit(width)
5439        rbit            w10, w6              // rbit(height)
5440        clz             w9,  w9              // ctz(width)
5441        clz             w10, w10             // ctz(height)
5442        add             w9,  w9,  w10        // log2sz
5443        add             x10, x1,  x2
5444        dup             v31.4s,  w9
5445        lsl             x2,  x2,  #1
5446        neg             v31.4s,  v31.4s      // -log2sz
5447        br              x7
5448
5449L(ipred_cfl_ac_444_w4):
5450        AARCH64_VALID_JUMP_TARGET
54511:      // Copy and expand input
5452        ld1             {v0.4h},   [x1],  x2
5453        ld1             {v0.d}[1], [x10], x2
5454        ld1             {v1.4h},   [x1],  x2
5455        ld1             {v1.d}[1], [x10], x2
5456        shl             v0.8h,   v0.8h,   #3
5457        shl             v1.8h,   v1.8h,   #3
5458        subs            w8,  w8,  #4
5459        st1             {v0.8h, v1.8h}, [x0], #32
5460        uaddw           v24.4s,  v24.4s,  v0.4h
5461        uaddw2          v25.4s,  v25.4s,  v0.8h
5462        uaddw           v26.4s,  v26.4s,  v1.4h
5463        uaddw2          v27.4s,  v27.4s,  v1.8h
5464        b.gt            1b
5465        trn2            v0.2d,   v1.2d,   v1.2d
5466        trn2            v1.2d,   v1.2d,   v1.2d
5467        b               L(ipred_cfl_ac_420_w4_hpad)
5468
5469L(ipred_cfl_ac_444_w8):
5470        AARCH64_VALID_JUMP_TARGET
54711:      // Copy and expand input
5472        ld1             {v0.8h}, [x1],  x2
5473        ld1             {v1.8h}, [x10], x2
5474        ld1             {v2.8h}, [x1],  x2
5475        shl             v0.8h,   v0.8h,   #3
5476        ld1             {v3.8h}, [x10], x2
5477        shl             v1.8h,   v1.8h,   #3
5478        shl             v2.8h,   v2.8h,   #3
5479        shl             v3.8h,   v3.8h,   #3
5480        subs            w8,  w8,  #4
5481        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5482        uaddw           v24.4s,  v24.4s,  v0.4h
5483        uaddw2          v25.4s,  v25.4s,  v0.8h
5484        uaddw           v26.4s,  v26.4s,  v1.4h
5485        uaddw2          v27.4s,  v27.4s,  v1.8h
5486        uaddw           v24.4s,  v24.4s,  v2.4h
5487        uaddw2          v25.4s,  v25.4s,  v2.8h
5488        uaddw           v26.4s,  v26.4s,  v3.4h
5489        uaddw2          v27.4s,  v27.4s,  v3.8h
5490        b.gt            1b
5491        mov             v0.16b,  v3.16b
5492        mov             v1.16b,  v3.16b
5493        b               L(ipred_cfl_ac_420_w8_hpad)
5494
5495L(ipred_cfl_ac_444_w16):
5496        AARCH64_VALID_JUMP_TARGET
5497        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
54981:      // Copy and expand input, without padding
5499        ld1             {v0.8h, v1.8h}, [x1],  x2
5500        ld1             {v2.8h, v3.8h}, [x10], x2
5501        shl             v0.8h,   v0.8h,   #3
5502        shl             v1.8h,   v1.8h,   #3
5503        shl             v2.8h,   v2.8h,   #3
5504        shl             v3.8h,   v3.8h,   #3
5505        subs            w8,  w8,  #2
5506        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5507        uaddw           v24.4s,  v24.4s,  v0.4h
5508        uaddw2          v25.4s,  v25.4s,  v0.8h
5509        uaddw           v26.4s,  v26.4s,  v1.4h
5510        uaddw2          v27.4s,  v27.4s,  v1.8h
5511        uaddw           v24.4s,  v24.4s,  v2.4h
5512        uaddw2          v25.4s,  v25.4s,  v2.8h
5513        uaddw           v26.4s,  v26.4s,  v3.4h
5514        uaddw2          v27.4s,  v27.4s,  v3.8h
5515        b.gt            1b
5516        mov             v0.16b,  v2.16b
5517        mov             v1.16b,  v3.16b
5518        b               L(ipred_cfl_ac_420_w16_hpad)
5519
5520L(ipred_cfl_ac_444_w16_wpad):
55211:      // Copy and expand input, padding 8
5522        ld1             {v0.8h}, [x1],  x2
5523        ld1             {v2.8h}, [x10], x2
5524        shl             v0.8h,   v0.8h,   #3
5525        shl             v2.8h,   v2.8h,   #3
5526        dup             v1.8h,   v0.h[7]
5527        dup             v3.8h,   v2.h[7]
5528        subs            w8,  w8,  #2
5529        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5530        uaddw           v24.4s,  v24.4s,  v0.4h
5531        uaddw2          v25.4s,  v25.4s,  v0.8h
5532        uaddw           v26.4s,  v26.4s,  v1.4h
5533        uaddw2          v27.4s,  v27.4s,  v1.8h
5534        uaddw           v24.4s,  v24.4s,  v2.4h
5535        uaddw2          v25.4s,  v25.4s,  v2.8h
5536        uaddw           v26.4s,  v26.4s,  v3.4h
5537        uaddw2          v27.4s,  v27.4s,  v3.8h
5538        b.gt            1b
5539        mov             v0.16b,  v2.16b
5540        mov             v1.16b,  v3.16b
5541        b               L(ipred_cfl_ac_420_w16_hpad)
5542
5543L(ipred_cfl_ac_444_w32):
5544        AARCH64_VALID_JUMP_TARGET
5545        adr             x7,  L(ipred_cfl_ac_444_w32_tbl)
5546        ldrh            w3,  [x7, w3, uxtw] // (w3>>1) << 1
5547        lsr             x2,  x2,  #1 // Restore the stride to one line increments
5548        sub             x7,  x7,  w3, uxtw
5549        br              x7
5550
5551L(ipred_cfl_ac_444_w32_wpad0):
5552        AARCH64_VALID_JUMP_TARGET
55531:      // Copy and expand input, without padding
5554        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
5555        shl             v0.8h,   v0.8h,   #3
5556        shl             v1.8h,   v1.8h,   #3
5557        shl             v2.8h,   v2.8h,   #3
5558        shl             v3.8h,   v3.8h,   #3
5559        subs            w8,  w8,  #1
5560        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
5561        uaddw           v24.4s,  v24.4s,  v0.4h
5562        uaddw2          v25.4s,  v25.4s,  v0.8h
5563        uaddw           v26.4s,  v26.4s,  v1.4h
5564        uaddw2          v27.4s,  v27.4s,  v1.8h
5565        uaddw           v24.4s,  v24.4s,  v2.4h
5566        uaddw2          v25.4s,  v25.4s,  v2.8h
5567        uaddw           v26.4s,  v26.4s,  v3.4h
5568        uaddw2          v27.4s,  v27.4s,  v3.8h
5569        b.gt            1b
5570        b               L(ipred_cfl_ac_444_w32_hpad)
5571
// Width 32 with 8 pixels of horizontal padding: 24 valid pixels per
// row; the last vector is filled by replicating the rightmost valid
// (already scaled) pixel, v2.h[7].
L(ipred_cfl_ac_444_w32_wpad2):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 8
        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
        // v2 is scaled first so its last lane is ready for the dup below.
        shl             v2.8h,   v2.8h,   #3
        shl             v0.8h,   v0.8h,   #3
        shl             v1.8h,   v1.8h,   #3
        dup             v3.8h,   v2.h[7]        // Pad with last valid pixel
        subs            w8,  w8,  #1            // One input row consumed
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        // Accumulate the full (padded) row into the running sums.
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)
5592
// Width 32 with 16 pixels of horizontal padding: 16 valid pixels per
// row; the two trailing vectors are both filled from the rightmost
// valid (already scaled) pixel, v1.h[7].
L(ipred_cfl_ac_444_w32_wpad4):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 16
        ld1             {v0.8h, v1.8h}, [x1],  x2
        // v1 is scaled first so its last lane is ready for the dups below.
        shl             v1.8h,   v1.8h,   #3
        shl             v0.8h,   v0.8h,   #3
        dup             v2.8h,   v1.h[7]        // Pad with last valid pixel
        dup             v3.8h,   v1.h[7]
        subs            w8,  w8,  #1            // One input row consumed
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        // Accumulate the full (padded) row into the running sums.
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
        b               L(ipred_cfl_ac_444_w32_hpad)
5613
// Width 32 with 24 pixels of horizontal padding: 8 valid pixels per
// row; the three trailing vectors are all filled from the rightmost
// valid (already scaled) pixel, v0.h[7].
// Falls through into L(ipred_cfl_ac_444_w32_hpad) below.
L(ipred_cfl_ac_444_w32_wpad6):
        AARCH64_VALID_JUMP_TARGET
1:      // Copy and expand input, padding 24
        ld1             {v0.8h}, [x1],  x2
        shl             v0.8h,   v0.8h,   #3
        dup             v1.8h,   v0.h[7]        // Pad with last valid pixel
        dup             v2.8h,   v0.h[7]
        dup             v3.8h,   v0.h[7]
        subs            w8,  w8,  #1            // One input row consumed
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        // Accumulate the full (padded) row into the running sums.
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            1b
5633
// Shared vertical-padding tail for all w32 wpad variants above.
// v0-v3 still hold the last expanded row; w4 = number of padding rows
// to emit (presumably even — the loop writes two rows per iteration;
// TODO(review): confirm against the caller's setup).
L(ipred_cfl_ac_444_w32_hpad):
        cbz             w4,  3f                 // No vertical padding needed
2:      // Vertical padding (h_pad > 0)
        // Replicate the last row twice per iteration, accumulating
        // each stored copy into the sums so the DC average stays
        // consistent with the buffer contents.
        subs            w4,  w4,  #2
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
        uaddw           v24.4s,  v24.4s,  v0.4h
        uaddw2          v25.4s,  v25.4s,  v0.8h
        uaddw           v26.4s,  v26.4s,  v1.4h
        uaddw2          v27.4s,  v27.4s,  v1.8h
        uaddw           v24.4s,  v24.4s,  v2.4h
        uaddw2          v25.4s,  v25.4s,  v2.8h
        uaddw           v26.4s,  v26.4s,  v3.4h
        uaddw2          v27.4s,  v27.4s,  v3.8h
        b.gt            2b
3:

        //  Multiply the height by eight and reuse the w4 subtracting
        // (a w32 row holds 8x the elements of a w4 row, so scaling w6
        // by 8 lets the shared w4 routine divide by the right count).
        lsl             w6,  w6,  #3
        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
5662
// Width dispatch table for the 4:4:4 AC function: 16-bit offsets that
// the dispatcher subtracts from the table address to form the branch
// target (cf. the ldrh/sub/br sequence). Entries are ordered w32,
// w16, w8, w4.
L(ipred_cfl_ac_444_tbl):
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w32)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w16)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w8)
        .hword L(ipred_cfl_ac_444_tbl) - L(ipred_cfl_ac_444_w4)
5668
// Horizontal-padding dispatch table for the w32 path: one entry per
// wpad value (0/2/4/6, i.e. 0/8/16/24 padded pixels), resolved the
// same way as L(ipred_cfl_ac_444_tbl) above.
L(ipred_cfl_ac_444_w32_tbl):
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad0)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad2)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad4)
        .hword L(ipred_cfl_ac_444_w32_tbl) - L(ipred_cfl_ac_444_w32_wpad6)
5674endfunc
5675