///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_weighted_pred_uni.s
//*
//* @brief
//*  contains function definitions for weighted prediction used in inter
//* prediction
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_weighted_pred_uni()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

///**
//*******************************************************************************
//*
//* @brief
//*  does uni-weighted prediction on the array pointed by pi2_src and stores
//* it at the location pointed by pu1_dst. assumptions: the function is
//* optimized considering the fact that width and height are multiples of 2.
//*
//* @par description:
//*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
//* off0
//*
//* @param[in] pi2_src
//*  pointer to the source
//*
//* @param[out] pu1_dst
//*  pointer to the destination
//*
//* @param[in] src_strd
//*  source stride
//*
//* @param[in] dst_strd
//*  destination stride
//*
//* @param[in] wgt0
//*  weight to be multiplied to the source
//*
//* @param[in] off0
//*  offset to be added after rounding and shifting
//*
//* @param[in] shift
//*  (14 - bit depth) + log2_weight_denominator
//*
//* @param[in] lvl_shift
//*  added before shift and offset
//*
//* @param[in] ht
//*  height of the source
//*
//* @param[in] wd
//*  width of the source
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_weighted_pred_uni(word16 *pi2_src,
//                             uword8 *pu1_dst,
//                             word32 src_strd,
//                             word32 dst_strd,
//                             word32 wgt0,
//                             word32 off0,
//                             word32 shift,
//                             word32 lvl_shift,
//                             word32 ht,
//                             word32 wd)
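
//**************reference scalar version***************************************
//a minimal c-style sketch of the per-pixel operation implemented below,
//assuming the prototype above; CLIP_U8 is an illustrative clip-to-[0, 255]
//macro and is not defined in this file:
//
//    {
//        word32 row, col, i4_tmp;
//        for(row = 0; row < ht; row++)
//        {
//            for(col = 0; col < wd; col++)
//            {
//                i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
//                i4_tmp += (1 << (shift - 1));
//                i4_tmp = (i4_tmp >> shift) + off0;
//                pu1_dst[col] = CLIP_U8(i4_tmp);
//            }
//            pi2_src += src_strd;
//            pu1_dst += dst_strd;
//        }
//    }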

//**************variables vs registers*****************************************
//    x0 => *pi2_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  wgt0
//    x5 =>  off0
//    x6 =>  shift
//    x7 =>  lvl_shift
//    x8 =>  ht
//    x9 =>  wd
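//
//    note : ht and wd are the 9th and 10th arguments; aapcs64 passes only the
//    first eight integer arguments in registers (x0-x7), so ht and wd are
//    loaded from the stack into w8/w9 in the prologue below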

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_uni_av8

.type ihevc_weighted_pred_uni_av8, %function

ihevc_weighted_pred_uni_av8:

    ldr         w8,[sp,#0]                  //load ht (9th argument, passed on the stack)
    ldr         w9,[sp,#8]                  //load wd (10th argument, passed on the stack)

    // stmfd sp!, {x4-x12, x14}                //stack stores the values of the arguments

    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!

    mov         x15,x4                      //save wgt0
    mov         x16,x5                      //save off0
    mov         x17,x6                      //save shift
    mov         x19,x7                      //save lvl_shift
    mov         x20,x8                      //save ht
    mov         x21,x9                      //save wd

    mov         x4,x15                      //load wgt0
    mov         x7,x19                      //load lvl_shift
    mov         x11,#1                      //constant 1 used to form (1 << (shift - 1))
    mov         x5,x16                      //load off0
    mul         x10, x7, x4                 //lvl_shift * wgt0
    mov         x6,x17                      //load shift
    mov         x8,x20                      //load ht
    lsl         x22,x5,x6                   //off0 << shift
    add         x10,x10,x22                 //lvl_shift * wgt0 + (off0 << shift)
    mov         x9,x21                      //load wd
    sub         x12,x6,#1                   //shift - 1
    mov         v0.h[0], w4                 //moved for scalar multiplication
    lsl         x2,x2,#1
    dup         v28.4s,w6                   //vmovq_n_s32(tmp_shift)
    lsl         x22,x11,x12
    add         x10,x10,x22                 //tmp_lvl_shift += (1 << (shift - 1))
    dup         v30.4s,w10                  //vmovq_n_s32(tmp_lvl_shift)
    neg         v28.4s, v28.4s
    lsl         x4,x9,#1
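
    //at this point v0.h[0] = wgt0, v30 = tmp_lvl_shift, v28 = -shift (so the
    //sshl instructions below perform an arithmetic right shift by 'shift'),
    //x2 = 2*src_strd (byte stride of the 16 bit source) and x4 = 2*wd
    //the per-pixel formula is folded into a single multiply-accumulate, since
    //  (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0
    //   = (src * wgt0 + lvl_shift * wgt0 + (off0 << shift) + (1 << (shift - 1))) >> shift
    //   = (src * wgt0 + tmp_lvl_shift) >> shift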

    cmp         x8,#0                       //check ht == 0
    beq         end_loops                   //if equal, then end the function

outer_loop:
    cmp         x9,#0                       //check wd == 0
    beq         end_loops                   //if equal, then end the function

core_loop:
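    //each iteration processes a 4x4 block: four pixels of row 0 of the current
    //stripe through x0/x1 and four pixels of each of rows 1-3 through the
    //temporary pointers x5/x6; every result is weighted, added to
    //tmp_lvl_shift, shifted, saturated to 8 bits and stored 4 bytes per row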
    add         x5,x0,x2                    //pi2_src_tmp = pi2_src + 2*src_strd (2* because pi2_src is a 16 bit pointer)
    add         x6,x1,x3                    //pu1_dst_tmp = pu1_dst + dst_strd
    ld1         {v1.4h},[x0],#8             //load and increment the pi2_src
    ld1         {v2.4h},[x5],x2             //load and increment the pi2_src_tmp ii iteration
    smull       v4.4s, v1.4h, v0.h[0]       //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)

    add         v4.4s,  v4.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
    ld1         {v3.4h},[x5],x2             //load and increment the pi2_src iii iteration

    smull       v6.4s, v2.4h, v0.h[0]       //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
    ld1         {v5.4h},[x5],x2             //load and increment the pi2_src_tmp iv iteration

    sshl        v4.4s,v4.4s,v28.4s
    //vshl.s32    q2,q2,q14                    //vshlq_s32(i4_tmp1_t, tmp_shift_t)
    add         v6.4s,  v6.4s ,  v30.4s     //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration

    smull       v7.4s, v3.4h, v0.h[0]       //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
    sqxtun      v4.4h, v4.4s                //vqmovun_s32(sto_res_tmp1)

    add         v7.4s,  v7.4s ,  v30.4s     //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration
    //mov v5, v4                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2)

    sshl        v6.4s,v6.4s,v28.4s
    //vshl.s32    q3,q3,q14                    //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration

    smull       v16.4s, v5.4h, v0.h[0]      //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
    uqxtn       v4.8b,  v4.8h               //vqmovn_u16(sto_res_tmp3)

    sshl        v7.4s,v7.4s,v28.4s
    //vshl.s32    q5,q5,q14                    //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
    sqxtun      v6.4h, v6.4s                //vqmovun_s32(sto_res_tmp1) ii iteration

    add         v16.4s,  v16.4s ,  v30.4s   //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration
    //mov v7, v6                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    sqxtun      v7.4h, v7.4s                //vqmovun_s32(sto_res_tmp1) iii iteration

    sshl        v16.4s,v16.4s,v28.4s
    //vshl.s32    q6,q6,q14                    //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
    st1         {v4.s}[0],[x1],#4           //store pu1_dst i iteration
    //mov v11, v10                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration

    uqxtn       v6.8b,  v6.8h               //vqmovn_u16(sto_res_tmp3) ii iteration
    st1         {v6.s}[0],[x6],x3           //store pu1_dst ii iteration

    uqxtn       v7.8b,  v7.8h               //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun      v16.4h, v16.4s              //vqmovun_s32(sto_res_tmp1) iv iteration

    //mov v13, v12                        //vcombine_u16(sto_res_tmp2, sto_res_tmp2) iv iteration
    st1         {v7.s}[0],[x6],x3           //store pu1_dst iii iteration
    uqxtn       v16.8b,  v16.8h             //vqmovn_u16(sto_res_tmp3) iv iteration

    subs        x9,x9,#4                    //decrement wd by 4 and check for 0
    st1         {v16.s}[0],[x6],x3          //store pu1_dst iv iteration
    bgt         core_loop                   //if greater than 0 repeat the core loop again

end_core_loop:
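    //core_loop leaves pi2_src and pu1_dst at the end of row 0 of the 4-row
    //stripe just processed, so step both pointers down four rows and back to
    //column 0 before the next pass over core_loop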
    sub         x22,x4,x2,lsl #2            //2*wd - 4*(2*src_strd)
    neg         x11, x22                    //4*(2*src_strd) - 2*wd
    subs        x8,x8,#4                    //decrement ht by 4
    add         x0,x0,x11                   //pi2_src + 4*src_strd - wd (byte arithmetic, so both terms are doubled for the 16 bit pointer)
    asr         x9,x4,#1                    //restore wd (x4 holds 2*wd)
    sub         x22,x9,x3,lsl #2            //wd - 4*dst_strd
    neg         x12, x22                    //4*dst_strd - wd
    add         x1,x1,x12                   //pu1_dst + 4*dst_strd - wd
    bgt         core_loop                   //if ht is greater than 0, repeat from core_loop

end_loops:
    // ldmfd sp!,{x4-x12,x15}                  //reload the registers from sp
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16

    ret