///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_weighted_pred_bi_default.s
//*
//* @brief
//*  contains function definitions for weighted prediction used in inter
//*  prediction
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_weighted_pred_bi_default()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  does default bi-weighted prediction on the arrays pointed by pi2_src1 and
//*  pi2_src2 and stores the result at the location pointed by pu1_dst.
//*  assumptions: the function is optimized considering the fact that width and
//*  height are multiples of 2.
//*
//* @par description:
//*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + 1 << (shift - 1) )
//*        >> shift, where shift = 15 - bitdepth
//*
//* @param[in] pi2_src1
//*  pointer to source 1
//*
//* @param[in] pi2_src2
//*  pointer to source 2
//*
//* @param[out] pu1_dst
//*  pointer to destination
//*
//* @param[in] src_strd1
//*  source stride 1
//*
//* @param[in] src_strd2
//*  source stride 2
//*
//* @param[in] dst_strd
//*  destination stride
//*
//* @param[in] lvl_shift1
//*  added before shift and offset
//*
//* @param[in] lvl_shift2
//*  added before shift and offset
//*
//* @param[in] ht
//*  height of the source
//*
//* @param[in] wd
//*  width of the source
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_weighted_pred_bi_default(word16 *pi2_src1,
//                                    word16 *pi2_src2,
//                                    uword8 *pu1_dst,
//                                    word32 src_strd1,
//                                    word32 src_strd2,
//                                    word32 dst_strd,
//                                    word32 lvl_shift1,
//                                    word32 lvl_shift2,
//                                    word32 ht,
//                                    word32 wd)
//**************variables vs registers*****************************************
//  x0 => *pi2_src1
//  x1 => *pi2_src2
//  x2 => *pu1_dst
//  x3 => src_strd1
//  x4 => src_strd2
//  x5 => dst_strd
//  x6 => lvl_shift1
//  x7 => lvl_shift2
//  x8 => ht
//  x9 => wd
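
// The NEON path below mirrors the scalar reference sketched here. This is a
// minimal illustration only: the function name and plain C types are
// illustrative (the codebase uses word16/uword8/word32 typedefs), and the
// 8-bit case is assumed, so shift = 15 - 8 = 7 and the rounding term is
// 1 << 6 = 0x40, matching the "movi v0.8h, #0x40" and the "#7" used by the
// sqshrun instructions below. The NEON code additionally saturates the
// intermediate 16-bit sums (sqadd).
//
// #include <stdint.h>
//
// static void weighted_pred_bi_default_ref(const int16_t *pi2_src1,
//                                          const int16_t *pi2_src2,
//                                          uint8_t *pu1_dst,
//                                          int src_strd1, int src_strd2,
//                                          int dst_strd,
//                                          int lvl_shift1, int lvl_shift2,
//                                          int ht, int wd)
// {
//     const int shift = 7;                  // 15 - bitdepth, with bitdepth = 8
//     const int rnd   = 1 << (shift - 1);   // 0x40
//     for(int row = 0; row < ht; row++)
//     {
//         for(int col = 0; col < wd; col++)
//         {
//             int tmp = (pi2_src1[col] + lvl_shift1)
//                     + (pi2_src2[col] + lvl_shift2) + rnd;
//             tmp >>= shift;
//             // clip to [0, 255]; the NEON path does this with sqshrun
//             pu1_dst[col] = (uint8_t)(tmp < 0 ? 0 : (tmp > 255 ? 255 : tmp));
//         }
//         pi2_src1 += src_strd1;
//         pi2_src2 += src_strd2;
//         pu1_dst  += dst_strd;
//     }
// }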

.text
.align 4
.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_bi_default_av8

.type ihevc_weighted_pred_bi_default_av8, %function

ihevc_weighted_pred_bi_default_av8:
    ldr         w8,[sp,#0]
    ldr         w9,[sp,#8]
    // stmfd sp!, {x4-x12, x14}            //stack stores the values of the arguments
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    mov         x15,x4                      // src_strd2 40
    mov         x16,x5                      // dst_strd 44
    mov         x17,x6                      // lvl_shift1 48
    mov         x19,x7                      // lvl_shift2 52
    mov         x20,x8                      // ht 56
    mov         x21,x9                      // wd 60
    mov         x4,x15                      //load src_strd2
    lsl         x3,x3,#1
    mov         x5,x16                      //load dst_strd
    mov         x6,x17                      //load lvl_shift1
    lsl         x4,x4,#1
    mov         x7,x19                      //load lvl_shift2
    mov         x8,x20                      //load ht
    mov         x9,x21                      //load wd
    dup         v4.8h,w6                    //lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1)
    dup         v6.8h,w7                    //lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2)
    movi        v0.8h, #0x40                //tmp_lvl_shift = 1 << (shift - 1)
    add         v4.8h, v4.8h,v6.8h
    add         v0.8h, v0.8h , v4.8h
//  vmvn.i32    v2.8h,#0x6                  @vmovq_n_s32(tmp_shift)
    lsl         x6,x9,#1
    sub         x20,x6,x3,lsl #2            //4*src_strd1 - wd
    neg         x7, x20
    sub         x20,x6,x4,lsl #2            //4*src_strd2 - wd
    neg         x10, x20
    //asr       x6,#1
    //rsb       x6,x6,x5,lsl #2             @4*dst_strd - wd
    cmp         x8,#0                       //check ht == 0
    beq         end_loops                   //if equal, then end the function

chroma_decision:
    orr         x14,x8,x9
    cmp         x14,#10
    beq         outer_loop_chroma_8x2

    cmp         x14,#6
    beq         outer_loop_chroma_4x2

luma_decision:
    cmp         x9,#24
    beq         outer_loop_8

    cmp         x9,#16
    bge         outer_loop_16

    cmp         x9,#12
    beq         outer_loop_4

    cmp         x9,#8
    bge         outer_loop_8

outer_loop_4:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function

core_loop_4:
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    ld1         {v6.4h},[x0],#8             //load and increment the pi2_src1
    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         {v7.4h},[x1],#8             //load and increment the pi2_src2
    ld1         {v1.4h},[x11],x3            //load and increment the pi2_src1 ii iteration
    sqadd       v18.4h,v6.4h,v7.4h
    sqadd       v18.4h,v18.4h,v0.4h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    ld1         {v3.4h},[x12],x4            //load and increment the pi2_src2 ii iteration
    sqadd       v20.4h,v1.4h,v3.4h          //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    sqadd       v19.4h,v20.4h,v0.4h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    mov         v18.d[1],v19.d[0]
    sqshrun     v20.8b, v18.8h,#7
    ld1         {v22.4h},[x11],x3           //load and increment the pi2_src1 iii iteration
    ld1         {v23.4h},[x12],x4           //load and increment the pi2_src2 iii iteration
    sqadd       v30.4h,v22.4h,v23.4h
    sqadd       v30.4h,v30.4h,v0.4h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    ld1         {v24.4h},[x11],x3           //load and increment the pi2_src1 iv iteration
    ld1         {v25.4h},[x12],x4           //load and increment the pi2_src2 iv iteration
    sqadd       v18.4h,v24.4h,v25.4h        //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    sqadd       v31.4h,v18.4h,v0.4h
    mov         v30.d[1],v31.d[0]
    st1         {v20.s}[0],[x2],#4          //store pu1_dst i iteration
    st1         {v20.s}[1],[x14],x5         //store pu1_dst ii iteration
    sqshrun     v30.8b, v30.8h,#7
    st1         {v30.s}[0],[x14],x5         //store pu1_dst iii iteration
                                            //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
    st1         {v30.s}[1],[x14],x5         //store pu1_dst iv iteration
    bgt         core_loop_4                 //if greater than 0 repeat the core loop again

end_core_loop_4:
    subs        x8,x8,#4                    //decrement the ht by 4
    add         x0,x0,x7                    //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         x9,x6,#1
    add         x1,x1,x10                   //pi2_src2 + 4*src_strd2 - 2*wd
    sub         x20,x9,x5,lsl #2            //4*dst_strd - wd
    neg         x14, x20
    add         x2,x2,x14                   //pu1_dst + 4*dst_strd - wd
    bgt         core_loop_4                 //if ht is greater than 0 goto outer_loop
    b           end_loops

// this is only for chroma module with input 2x2
outer_loop_chroma_4x2:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    sub         x20,x6,x3,lsl #1            //2*src_strd1 - wd
    neg         x7, x20
    sub         x20,x6,x4,lsl #1            //2*src_strd2 - wd
    neg         x10, x20

core_loop_chroma_4x2:
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    ld1         {v6.4h},[x0],#8             //load and increment the pi2_src1
    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         {v7.4h},[x1],#8             //load and increment the pi2_src2
    ld1         {v1.4h},[x11],x3            //load and increment the pi2_src1 ii iteration
    sqadd       v18.4h,v6.4h,v7.4h
    sqadd       v18.4h,v18.4h,v0.4h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    ld1         {v3.4h},[x12],x4            //load and increment the pi2_src2 ii iteration
    sqadd       v20.4h,v1.4h,v3.4h          //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    sqadd       v19.4h,v20.4h,v0.4h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    mov         v18.d[1],v19.d[0]
    sqshrun     v20.8b, v18.8h,#7
    st1         {v20.s}[0],[x2],#4          //store pu1_dst i iteration
    st1         {v20.s}[1],[x14],x5         //store pu1_dst ii iteration
    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
    bgt         core_loop_chroma_4x2        //if greater than 0 repeat the core loop again

end_core_loop_chorma_4x2:
    subs        x8,x8,#2                    //decrement the ht by 2
    add         x0,x0,x7                    //pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         x9,x6,#1
    add         x1,x1,x10                   //pi2_src2 + 2*src_strd2 - 2*wd
    sub         x20,x9,x5,lsl #1            //2*dst_strd - wd
    neg         x14, x20
    add         x2,x2,x14                   //pu1_dst + 2*dst_strd - wd
    bgt         core_loop_chroma_4x2        //if ht is greater than 0 goto outer_loop
    b           end_loops

outer_loop_8:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)

core_loop_8:
    ld1         { v24.8h},[x0],#16          //load and increment the pi2_src1
    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         { v26.8h},[x1],#16          //load and increment the pi2_src2
    sqadd       v24.8h,v24.8h,v26.8h
    ld1         { v28.8h},[x11],x3          //load and increment the pi2_src1 ii iteration
    sqadd       v24.8h,v24.8h,v0.8h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    ld1         { v30.8h},[x12],x4          //load and increment the pi2_src2 ii iteration
    ld1         { v16.8h},[x11],x3          //load and increment the pi2_src1 iii iteration
    sqadd       v22.8h,v28.8h,v30.8h        //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    ld1         { v18.8h},[x12],x4          //load and increment the pi2_src2 iii iteration
    sqadd       v22.8h,v22.8h,v0.8h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    sqshrun     v20.8b, v24.8h,#7
    ld1         { v17.8h},[x11],x3          //load and increment the pi2_src1 iv iteration
    sqadd       v30.8h,v16.8h,v18.8h
    sqshrun     v21.8b, v22.8h,#7
    ld1         { v29.8h},[x12],x4          //load and increment the pi2_src2 iv iteration
    sqadd       v30.8h,v30.8h,v0.8h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration
    st1         {v20.2s},[x2],#8            //store pu1_dst i iteration
    sqadd       v1.8h,v17.8h,v29.8h         //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    st1         {v21.2s},[x14],x5           //store pu1_dst ii iteration
    sqadd       v1.8h,v1.8h,v0.8h
    sqshrun     v30.8b, v30.8h,#7
    sqshrun     v31.8b, v1.8h,#7
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    st1         {v30.2s},[x14],x5           //store pu1_dst iii iteration
                                            //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    subs        x9,x9,#8                    //decrement wd by 8 and check for 0
    st1         {v31.2s},[x14],x5           //store pu1_dst iv iteration
    bgt         core_loop_8                 //if greater than 0 repeat the core loop again

end_core_loop_8:
    subs        x8,x8,#4                    //decrement the ht by 4
    add         x0,x0,x7                    //pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         x9,x6,#1
    add         x1,x1,x10                   //pi2_src2 + 4*src_strd2 - 2*wd
    sub         x20,x9,x5,lsl #2            //4*dst_strd - wd
    neg         x14, x20
    add         x2,x2,x14                   //pu1_dst + 4*dst_strd - wd
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    bgt         core_loop_8
    b           end_loops

// this is only for chroma module with input 4x2
outer_loop_chroma_8x2:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    sub         x20,x6,x3,lsl #1            //2*src_strd1 - wd
    neg         x7, x20
    sub         x20,x6,x4,lsl #1            //2*src_strd2 - wd
    neg         x10, x20

core_loop_chroma_8x2:
    ld1         { v24.8h},[x0],#16          //load and increment the pi2_src1
    add         x14,x2,x5                   //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         { v26.8h},[x1],#16          //load and increment the pi2_src2
    sqadd       v24.8h,v24.8h,v26.8h
    ld1         { v28.8h},[x11],x3          //load and increment the pi2_src1 ii iteration
    sqadd       v24.8h,v24.8h,v0.8h         //vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    ld1         { v30.8h},[x12],x4          //load and increment the pi2_src2 ii iteration
    ld1         { v16.8h},[x11],x3          //load and increment the pi2_src1 iii iteration
    sqadd       v22.8h,v28.8h,v30.8h        //vaddq_s32(i4_tmp2_t1, i4_tmp2_t2)
    sqadd       v22.8h,v22.8h,v0.8h         //vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t)
    sqshrun     v20.8b, v24.8h,#7
    sqshrun     v21.8b, v22.8h,#7
    st1         {v20.2s},[x2],#8            //store pu1_dst i iteration
    st1         {v21.2s},[x14],x5           //store pu1_dst ii iteration
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    subs        x9,x9,#8                    //decrement wd by 8 and check for 0
    bgt         core_loop_chroma_8x2        //if greater than 0 repeat the core loop again

end_core_loop_chroma_8x2:
    subs        x8,x8,#2                    //decrement the ht by 2
    add         x0,x0,x7                    //pi2_src1 + 2*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         x9,x6,#1
    add         x1,x1,x10                   //pi2_src2 + 2*src_strd2 - 2*wd
    sub         x20,x9,x5,lsl #1            //2*dst_strd - wd
    neg         x14, x20
    add         x2,x2,x14                   //pu1_dst + 2*dst_strd - wd
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    bgt         core_loop_chroma_8x2
    b           end_loops

outer_loop_16:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function
    add         x11,x0,x3                   //pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         x12,x1,x4                   //pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    sub         x20,x6,x3,lsl #1            //2*src_strd1 - wd
    neg         x7, x20
    mov         x14,#16
    sub         x10,x14,x5
    sub         x11,x3,x14
    sub         x12,x14,x3
    sub         x20,x9,x5,lsl #1            //2*dst_strd - wd
    neg         x14, x20

prolog_16:
    ld1         { v2.8h},[x0],#16           //load and increment the pi2_src1
    ld1         { v4.8h},[x1],#16           //load and increment the pi2_src2
    ld1         { v5.8h},[x0],x11           //load and increment the pi2_src1
    ld1         { v17.8h},[x1],x11          //load and increment the pi2_src2
    ld1         { v6.8h},[x0],#16           //load and increment the pi2_src1 ii iteration
    subs        x9,x9,#16
    ld1         { v1.8h},[x1],#16           //load and increment the pi2_src2 ii iteration
    sub         x20,x8,#2
    csel        x8, x20, x8,eq
    sqadd       v22.8h,v2.8h,v4.8h
    ld1         { v29.8h},[x0],x12          //load and increment the pi2_src1 ii iteration
    sqadd       v28.8h,v5.8h,v17.8h
    ld1         { v16.8h},[x1],x12          //load and increment the pi2_src2 ii iteration
    add         x20,x0,x7
    csel        x0, x20, x0,eq
    add         x20,x1,x7
    csel        x1, x20, x1,eq
    sqadd       v24.8h,v6.8h,v1.8h
    ld1         { v2.8h},[x0],#16
    sqadd       v26.8h,v29.8h,v16.8h

    // if the input is chroma with 8x2 block size
    cmp         x8,#0
    beq         epilog_16

    ld1         { v4.8h},[x1],#16           //load and increment the pi2_src2
    sqadd       v22.8h,v22.8h,v0.8h
    ld1         { v5.8h},[x0],x11           //load and increment the pi2_src1
    sqadd       v28.8h,v28.8h,v0.8h
    ld1         { v17.8h},[x1],x11          //load and increment the pi2_src2
    sqadd       v24.8h,v24.8h,v0.8h
    ld1         { v6.8h},[x0],#16           //load and increment the pi2_src1 ii iteration
    sqadd       v30.8h,v26.8h,v0.8h
    sqshrun     v20.8b, v22.8h,#7
    ld1         { v1.8h},[x1],#16           //load and increment the pi2_src2 ii iteration
    sqshrun     v21.8b, v28.8h,#7
    ld1         { v29.8h},[x0],x12          //load and increment the pi2_src1 ii iteration
    sqshrun     v26.8b, v24.8h,#7
    ld1         { v16.8h},[x1],x12          //load and increment the pi2_src2 ii iteration
    sqshrun     v27.8b, v30.8h,#7

core_loop_16:
    cmp         x9,#0
    sqadd       v22.8h,v2.8h,v4.8h
    asr         x20,x6,#1
    csel        x9,x20,x9,eq                //asreq x9,x6,#1
    mov         v20.d[1],v21.d[0]
    mov         v26.d[1],v27.d[0]
    st1         { v20.4s},[x2],x5
    sqadd       v28.8h,v5.8h,v17.8h
    st1         { v26.4s},[x2],x10
    add         x20,x2,x14
    csel        x2, x20, x2,eq
    sqadd       v24.8h,v6.8h,v1.8h
    subs        x9,x9,#16
    add         x20,x0,x7
    csel        x0, x20, x0,eq
    sqadd       v26.8h,v29.8h,v16.8h
    add         x20,x1,x7
    csel        x1, x20, x1,eq
    sub         x20,x8,#2
    csel        x8,x20,x8,eq
    cmp         x8,#0                       //subeqs x8,x8,#2 //decrement the ht by 2
    beq         epilog_16

    sqadd       v22.8h,v22.8h,v0.8h
    ld1         { v2.8h},[x0],#16           //load and increment the pi2_src1
    sqadd       v28.8h,v28.8h,v0.8h
    ld1         { v4.8h},[x1],#16           //load and increment the pi2_src2
    sqadd       v24.8h,v24.8h,v0.8h
    ld1         { v5.8h},[x0],x11           //load and increment the pi2_src1
    sqadd       v30.8h,v26.8h,v0.8h
    ld1         { v17.8h},[x1],x11          //load and increment the pi2_src2
    sqshrun     v20.8b, v22.8h,#7
    ld1         { v6.8h},[x0],#16           //load and increment the pi2_src1 ii iteration
    sqshrun     v21.8b, v28.8h,#7
    ld1         { v1.8h},[x1],#16           //load and increment the pi2_src2 ii iteration
    sqshrun     v26.8b, v24.8h,#7
    ld1         { v29.8h},[x0],x12          //load and increment the pi2_src1 ii iteration
    sqshrun     v27.8b, v30.8h,#7
    ld1         { v16.8h},[x1],x12          //load and increment the pi2_src2 ii iteration
    b           core_loop_16

epilog_16:
    sqadd       v22.8h,v22.8h,v0.8h
    sqadd       v28.8h,v28.8h,v0.8h
    sqadd       v24.8h,v24.8h,v0.8h
    sqadd       v30.8h,v26.8h,v0.8h
    sqshrun     v20.8b, v22.8h,#7
    sqshrun     v21.8b, v28.8h,#7
    sqshrun     v26.8b, v24.8h,#7
    sqshrun     v27.8b, v30.8h,#7
    mov         v20.d[1],v21.d[0]
    mov         v26.d[1],v27.d[0]
    st1         { v20.4s},[x2],x5
    st1         { v26.4s},[x2]

end_core_loop_16:

end_loops:
    // ldmfd sp!,{x4-x12,x15}               //reload the registers from sp
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ret