///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* //file
//*  ihevc_inter_pred_chroma_vert_neon_w16inp_neon.s
//*
//* //brief
//*  contains function definitions for inter prediction interpolation.
//*  functions are coded in armv8 (aarch64) neon assembly for the gnu
//*  assembler.
//*
//*
//* //author
//*  yogeswaran rs / parthiban
//*
//* //par list of functions:
//*
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
///**
///**
//*******************************************************************************
//*
//* //brief
//*       chroma interprediction filter for 16bit vertical input.
//*
//* //par description:
//*    applies a 4-tap vertical filter with coefficients pointed to by
//*    'pi1_coeff' to the 16-bit elements pointed to by 'pi2_src' and writes
//*    the result to the location pointed to by 'pu1_dst'. the filter output
//*    is downshifted by 12 (a shift by 6 followed by a rounding shift by 6)
//*    and clipped to lie between 0 and 255.
//*    assumptions : the function is optimized considering the fact that
//*    width and height are multiples of 2.
//*
//* //param[in] pi2_src
//*  word16 pointer to the source
//*
//* //param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* //param[in] src_strd
//*  integer source stride
//*
//* //param[in] dst_strd
//*  integer destination stride
//*
//* //param[in] pi1_coeff
//*  word8 pointer to the filter coefficients
//*
//* //param[in] ht
//*  integer height of the array
//*
//* //param[in] wd
//*  integer width of the array
//*
//* //returns
//*
//* //remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_inter_pred_chroma_vert_w16inp(word16 *pi2_src,
//                                          uword8 *pu1_dst,
//                                          word32 src_strd,
//                                          word32 dst_strd,
//                                          word8 *pi1_coeff,
//                                          word32 ht,
//                                          word32 wd)
//**************variables vs registers*****************************************
//x0 => *pi2_src
//x1 => *pu1_dst
//x2 =>  src_strd
//x3 =>  dst_strd
//x4 => *pi1_coeff
//x5 =>  ht
//x6 =>  wd

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_inter_pred_chroma_vert_w16inp_av8

.type ihevc_inter_pred_chroma_vert_w16inp_av8, %function

ihevc_inter_pred_chroma_vert_w16inp_av8:

    //-------------------------------------------------------------------------
    // 4-tap vertical chroma interpolation: 16-bit input, 8-bit output.
    //
    // for every output sample:
    //   acc = c0*src[row-1] + c1*src[row] + c2*src[row+1] + c3*src[row+2]
    //   dst = sat_u8( round_shift( sat_s16(acc >> 6), 6 ) )
    //
    // in  (aapcs64):
    //   x0 = pi2_src   (16-bit samples)
    //   x1 = pu1_dst   (8-bit samples)
    //   w2 = src_strd  (in 16-bit elements; doubled below to get bytes)
    //   w3 = dst_strd  (in bytes)
    //   x4 = pi1_coeff (8 bytes are read, the first 4 are used)
    //   w5 = ht
    //   w6 = wd        (each row holds 2*wd samples)
    // out: nothing returned; 2*wd bytes are written per destination row
    // clobbers: x0-x9, x11, x12, x14, v0-v7, v16-v19, v24, v26, v28, v30, nzcv
    //           (x20 is used as scratch and restored before returning)
    //
    // two code paths:
    //   core_loop_ht_2 : 4 samples x 2 rows per iteration (any even wd / ht)
    //   core_loop_ht_4 : 4 samples x 4 rows per iteration, software pipelined
    //                    (taken only when wd and ht are both multiples of 4)
    //-------------------------------------------------------------------------

    stp         x19, x20,[sp,#-16]!         //x20 is scratch below; saved as a pair so sp stays 16-byte aligned

    //the four integer arguments are 32-bit in the c prototype; the upper
    //halves of their registers are unspecified on entry, so widen them
    //before they take part in 64-bit address / counter arithmetic
    sxtw        x2,w2                       //src_strd
    sxtw        x3,w3                       //dst_strd
    sxtw        x5,w5                       //ht
    sxtw        x6,w6                       //wd

    lsl         x2,x2,#1                    //src_strd in bytes = 2 * src_strd
    ld1         {v0.8b},[x4]                //loads pi1_coeff
    sub         x4,x0,x2                    //pi2_src - src_strd (row above the current one)
    sxtl        v0.8h, v0.8b                //widen the coefficients to 16 bit

    tst         x6,#3                       //is wd a multiple of 4 ?
    dup         v16.4h, v0.h[0]             //coeff_0
    dup         v17.4h, v0.h[1]             //coeff_1
    dup         v18.4h, v0.h[2]             //coeff_2
    dup         v19.4h, v0.h[3]             //coeff_3

    bne         core_loop_ht_2              //wd not a multiple of 4 : 2-row loop

    tst         x5,#3                       //checks ht == mul of 4
    beq         core_loop_ht_4              //jumps to loop handles ht mul of 4

core_loop_ht_2:
    lsl         x7,x2,#1                    //byte offset of 2 source rows
    lsl         x12,x3,#1                   //2*dst_strd
    lsl         x9,x6,#2                    //4*wd = bytes in one source row
    sub         x6,x12,x6,lsl #1            //2*dst_strd - 2*wd (dst rewind to next row pair)
    sub         x8,x7,x9                    //2 source rows - 4*wd (src rewind to next row pair)
    mov         x12,x9                      //source bytes left in this row pair

inner_loop_ht_2:
    add         x0,x4,x2                    //x0 = current row, x4 = row above
    ld1         {v0.4h},[x4],#8             //loads row -1, steps 4 samples right
    smull       v0.4s, v0.4h, v16.4h        //row0 acc  = row(-1) * coeff_0
    subs        x12,x12,#8                  //8 source bytes (4 samples) consumed
    ld1         {v2.4h},[x0],x2             //loads row 0
    smull       v7.4s, v2.4h, v16.4h        //row1 acc  = row(0) * coeff_0
    ld1         {v3.4h},[x0],x2             //loads row 1
    smlal       v0.4s, v2.4h, v17.4h        //row0 acc += row(0) * coeff_1
    ld1         {v6.4h},[x0],x2             //loads row 2
    smlal       v7.4s, v3.4h, v17.4h        //row1 acc += row(1) * coeff_1
    ld1         {v2.4h},[x0]                //loads row 3
    add         x7,x1,x3                    //pu1_dst + dst_strd
    smlal       v0.4s, v3.4h, v18.4h        //row0 acc += row(1) * coeff_2
    smlal       v7.4s, v6.4h, v18.4h        //row1 acc += row(2) * coeff_2
    smlal       v0.4s, v6.4h, v19.4h        //row0 acc += row(2) * coeff_3
    smlal       v7.4s, v2.4h, v19.4h        //row1 acc += row(3) * coeff_3
    sqshrn      v0.4h, v0.4s,#6             //right shift by 6, saturate to s16
    sqshrn      v30.4h, v7.4s,#6            //right shift by 6, saturate to s16
    sqrshrun    v0.8b, v0.8h,#6             //rounding shift by 6, saturate to u8
    sqrshrun    v30.8b, v30.8h,#6           //rounding shift by 6, saturate to u8
    st1         {v0.s}[0],[x1],#4           //stores 4 output bytes of row 0
    st1         {v30.s}[0],[x7]             //stores 4 output bytes of row 1
    bgt         inner_loop_ht_2             //flags from the subs on x12 : more columns left

    //inner loop ends
    subs        x5,x5,#2                    //two rows done
    add         x1,x1,x6                    //pu1_dst += 2*dst_strd - 2*wd
    mov         x12,x9                      //reload the per-row byte counter (4*wd)
    add         x4,x4,x8                    //src pointer += 2 rows - 4*wd
    bgt         inner_loop_ht_2             //loop again while rows remain

    b           end_loops                   //jumps to end

core_loop_ht_4:
    lsl         x7,x2,#2                    //byte offset of 4 source rows
    lsl         x12,x3,#2                   //4*dst_strd
    lsr         x11, x6, #1                 //wd / 2
    sub         x14,x12,x6,lsl #1           //4*dst_strd - 2*wd (dst rewind to next 4-row band)
    sub         x8,x7,x6,lsl #2             //4 source rows - 4*wd (src rewind to next 4-row band)

    mul         x12, x5 , x11               //ht * wd / 2 = 4 * number of 4x4 tiles
    sub         x12, x12,#4                 //the last tile is finished by the epilog
    lsl         x11, x6, #1                 //2*wd = samples left in the current band

    //software pipeline : v30/v28/v26/v24 hold output rows 0..3 of a 4x4
    //tile; loads for the next tile overlap the shifts and stores of the
    //current one, so the instruction order below must be preserved
prolog:
    add         x0,x4,x2                    //x0 = current row, x4 = row above
    ld1         {v0.4h},[x4],#8             //loads row -1, steps 4 samples right
    ld1         {v1.4h},[x0],x2             //loads row 0
    subs        x11,x11,#4                  //4 samples of this band consumed (flags used by the csel's below)
    ld1         {v2.4h},[x0],x2             //loads row 1
    smull       v30.4s, v0.4h, v16.4h       //row0 acc  = row(-1) * coeff_0
    ld1         {v3.4h},[x0],x2             //loads row 2
    smlal       v30.4s, v1.4h, v17.4h
    smlal       v30.4s, v2.4h, v18.4h
    add         x9,x1,x3                    //pu1_dst + dst_strd
    smlal       v30.4s, v3.4h, v19.4h

    ld1         {v4.4h},[x0],x2             //loads row 3
    smull       v28.4s, v1.4h, v16.4h       //row1 acc  = row(0) * coeff_0
    add         x20,x4,x8
    csel        x4, x20, x4,le              //band finished : src moves down 4 rows
    smlal       v28.4s, v2.4h, v17.4h
    ld1         {v5.4h},[x0],x2             //loads row 4
    smlal       v28.4s, v3.4h, v18.4h
    ld1         {v6.4h},[x0],x2             //loads row 5
    smlal       v28.4s, v4.4h, v19.4h
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //band finished : reload the 2*wd counter

    sqshrn      v30.4h, v30.4s,#6           //right shift by 6, saturate to s16

    smull       v26.4s, v2.4h, v16.4h       //row2 acc  = row(1) * coeff_0
    add         x0,x4,x2                    //row pointer for the next tile
    smlal       v26.4s, v3.4h, v17.4h
    smlal       v26.4s, v4.4h, v18.4h
    ld1         {v0.4h},[x4],#8             //next tile : loads row -1
    smlal       v26.4s, v5.4h, v19.4h

    sqrshrun    v30.8b, v30.8h,#6           //rounding shift by 6, saturate to u8
    sqshrn      v28.4h, v28.4s,#6           //right shift by 6, saturate to s16

    ld1         {v1.4h},[x0],x2             //next tile : loads row 0
    smull       v24.4s, v3.4h, v16.4h       //row3 acc  = row(2) * coeff_0
    st1         {v30.s}[0],[x1],#4          //stores output row 0
    smlal       v24.4s, v4.4h, v17.4h
    ld1         {v2.4h},[x0],x2             //next tile : loads row 1
    smlal       v24.4s, v5.4h, v18.4h
    ld1         {v3.4h},[x0],x2             //next tile : loads row 2
    smlal       v24.4s, v6.4h, v19.4h
    add         x20,x1,x14
    csel        x1, x20, x1,le              //band finished : dst moves down 4 rows

    sqshrn      v26.4h, v26.4s,#6           //right shift by 6, saturate to s16
    subs        x12,x12,#4                  //one tile accounted for
    sqrshrun    v28.8b, v28.8h,#6           //rounding shift by 6, saturate to u8

    beq         epilog                      //only the epilog tile is left

kernel_4:
    smull       v30.4s, v0.4h, v16.4h       //row0 acc  = row(-1) * coeff_0
    subs        x11,x11,#4                  //4 samples of this band consumed (flags used by the csel's below)
    smlal       v30.4s, v1.4h, v17.4h
    st1         {v28.s}[0],[x9],x3          //previous tile : stores output row 1
    smlal       v30.4s, v2.4h, v18.4h
    smlal       v30.4s, v3.4h, v19.4h

    sqshrn      v24.4h, v24.4s,#6           //right shift by 6, saturate to s16
    sqrshrun    v26.8b, v26.8h,#6           //rounding shift by 6, saturate to u8

    ld1         {v4.4h},[x0],x2             //loads row 3
    smull       v28.4s, v1.4h, v16.4h       //row1 acc  = row(0) * coeff_0
    smlal       v28.4s, v2.4h, v17.4h
    smlal       v28.4s, v3.4h, v18.4h
    smlal       v28.4s, v4.4h, v19.4h
    st1         {v26.s}[0],[x9],x3          //previous tile : stores output row 2
    add         x20,x4,x8
    csel        x4, x20, x4,le              //band finished : src moves down 4 rows
    lsl         x20,x6,#1
    csel        x11, x20, x11,le            //band finished : reload the 2*wd counter

    sqshrn      v30.4h, v30.4s,#6           //right shift by 6, saturate to s16
    sqrshrun    v24.8b, v24.8h,#6           //rounding shift by 6, saturate to u8

    ld1         {v5.4h},[x0],x2             //loads row 4
    smull       v26.4s, v2.4h, v16.4h       //row2 acc  = row(1) * coeff_0
    ld1         {v6.4h},[x0],x2             //loads row 5
    smlal       v26.4s, v3.4h, v17.4h
    st1         {v24.s}[0],[x9]             //previous tile : stores output row 3
    add         x0,x4,x2                    //row pointer for the next tile
    smlal       v26.4s, v4.4h, v18.4h
    ld1         {v0.4h},[x4],#8             //next tile : loads row -1
    smlal       v26.4s, v5.4h, v19.4h

    sqshrn      v28.4h, v28.4s,#6           //right shift by 6, saturate to s16
    sqrshrun    v30.8b, v30.8h,#6           //rounding shift by 6, saturate to u8

    ld1         {v1.4h},[x0],x2             //next tile : loads row 0
    smull       v24.4s, v3.4h, v16.4h       //row3 acc  = row(2) * coeff_0
    add         x9,x1,x3                    //pu1_dst + dst_strd
    ld1         {v2.4h},[x0],x2             //next tile : loads row 1
    smlal       v24.4s, v4.4h, v17.4h
    ld1         {v3.4h},[x0],x2             //next tile : loads row 2
    smlal       v24.4s, v5.4h, v18.4h

    st1         {v30.s}[0],[x1],#4          //stores output row 0
    smlal       v24.4s, v6.4h, v19.4h

    sqshrn      v26.4h, v26.4s,#6           //right shift by 6, saturate to s16
    sqrshrun    v28.8b, v28.8h,#6           //rounding shift by 6, saturate to u8
    add         x20,x1,x14
    csel        x1, x20, x1,le              //band finished : dst moves down 4 rows

    subs        x12,x12,#4                  //one tile accounted for

    bgt         kernel_4                    //jumps to kernel_4

    //drains the pipeline : finishes the tile in flight and computes the
    //last tile from the rows already loaded into v0-v3
epilog:
    smull       v30.4s, v0.4h, v16.4h       //row0 acc  = row(-1) * coeff_0
    st1         {v28.s}[0],[x9],x3          //previous tile : stores output row 1
    smlal       v30.4s, v1.4h, v17.4h
    smlal       v30.4s, v2.4h, v18.4h
    smlal       v30.4s, v3.4h, v19.4h

    sqshrn      v24.4h, v24.4s,#6           //right shift by 6, saturate to s16
    sqrshrun    v26.8b, v26.8h,#6           //rounding shift by 6, saturate to u8

    smull       v28.4s, v1.4h, v16.4h       //row1 acc  = row(0) * coeff_0
    ld1         {v4.4h},[x0],x2             //loads row 3
    smlal       v28.4s, v2.4h, v17.4h
    st1         {v26.s}[0],[x9],x3          //previous tile : stores output row 2
    smlal       v28.4s, v3.4h, v18.4h
    smlal       v28.4s, v4.4h, v19.4h

    sqshrn      v30.4h, v30.4s,#6           //right shift by 6, saturate to s16
    sqrshrun    v24.8b, v24.8h,#6           //rounding shift by 6, saturate to u8

    smull       v26.4s, v2.4h, v16.4h       //row2 acc  = row(1) * coeff_0
    ld1         {v5.4h},[x0],x2             //loads row 4
    smlal       v26.4s, v3.4h, v17.4h
    smlal       v26.4s, v4.4h, v18.4h
    smlal       v26.4s, v5.4h, v19.4h

    sqshrn      v28.4h, v28.4s,#6           //right shift by 6, saturate to s16
    sqrshrun    v30.8b, v30.8h,#6           //rounding shift by 6, saturate to u8

    st1         {v24.s}[0],[x9]             //previous tile : stores output row 3
    smull       v24.4s, v3.4h, v16.4h       //row3 acc  = row(2) * coeff_0
    smlal       v24.4s, v4.4h, v17.4h
    add         x9,x1,x3                    //pu1_dst + dst_strd
    ld1         {v6.4h},[x0],x2             //loads row 5
    smlal       v24.4s, v5.4h, v18.4h
    smlal       v24.4s, v6.4h, v19.4h
    st1         {v30.s}[0],[x1],#4          //stores output row 0

    sqrshrun    v28.8b, v28.8h,#6           //rounding shift by 6, saturate to u8
    sqshrn      v26.4h, v26.4s,#6           //right shift by 6, saturate to s16

    st1         {v28.s}[0],[x9],x3          //stores output row 1
    sqrshrun    v26.8b, v26.8h,#6           //rounding shift by 6, saturate to u8

    sqshrn      v24.4h, v24.4s,#6           //right shift by 6, saturate to s16
    st1         {v26.s}[0],[x9],x3          //stores output row 2
    sqrshrun    v24.8b, v24.8h,#6           //rounding shift by 6, saturate to u8

    st1         {v24.s}[0],[x9]             //stores output row 3

end_loops:
    ldp         x19, x20,[sp],#16           //restore the callee-saved pair pushed on entry

    ret