@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_weighted_pred_bi.s
@*
@* @brief
@*  contains function definitions for weighted prediction used in inter
@* prediction
@*
@* @author
@*  parthiban v
@*
@* @par list of functions:
@*  - ihevc_weighted_pred_bi()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  does bi-weighted prediction on the arrays pointed by  pi2_src1 and
@* pi2_src2 and stores it at location pointed  by pi2_dst   assumptions : the
@* function is optimized considering the fact width and  height are multiple
@* of 2.
@*
@* @par description:
@*  dst = ( (src1 + lvl_shift1)*wgt0 +  (src2 + lvl_shift2)*wgt1 +  (off0 +
@* off1 + 1) << (shift - 1) ) >> shift
@*
@* @param[in] pi2_src1
@*  pointer to source 1
@*
@* @param[in] pi2_src2
@*  pointer to source 2
@*
@* @param[out] pu1_dst
@*  pointer to destination
@*
@* @param[in] src_strd1
@*  source stride 1
@*
@* @param[in] src_strd2
@*  source stride 2
@*
@* @param[in] dst_strd
@*  destination stride
@*
@* @param[in] wgt0
@*  weight to be multiplied to source 1
@*
@* @param[in] off0
@*  offset 0
@*
@* @param[in] wgt1
@*  weight to be multiplied to source 2
@*
@* @param[in] off1
@*  offset 1
@*
@* @param[in] shift
@*  (14 bit depth) + log2_weight_denominator
@*
@* @param[in] lvl_shift1
@*  added before shift and offset
@*
@* @param[in] lvl_shift2
@*  added before shift and offset
@*
@* @param[in] ht
@*  height of the source
@*
@* @param[in] wd
@*  width of the source
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_weighted_pred_bi(word16 *pi2_src1,
@                            word16 *pi2_src2,
@                            uword8 *pu1_dst,
@                            word32 src_strd1,
@                            word32 src_strd2,
@                            word32 dst_strd,
@                            word32 wgt0,
@                            word32 off0,
@                            word32 wgt1,
@                            word32 off1,
@                            word32 shift,
@                            word32 lvl_shift1,
@                            word32 lvl_shift2,
@                            word32 ht,
@                            word32 wd)

@**************variables vs registers*****************************************
@   r0 => *pi2_src1
@   r1 => *pi2_src2
@   r2 => *pu1_dst
@   r3 =>  src_strd1
@   r4 =>  src_strd2
@   r5 =>  dst_strd
@   r6 =>  wgt0
@   r7 =>  off0
@   r8 =>  wgt1
@   r9 =>  off1
@   r10 =>  shift
@   r11 =>  lvl_shift1
@   r12 =>  lvl_shift2
@   r14 =>  ht
@   r7  =>  wd

.text
.align 4


.globl ihevc_weighted_pred_bi_a9q

.type ihevc_weighted_pred_bi_a9q, %function

ihevc_weighted_pred_bi_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments

    ldr         r6,[sp,#48]                 @load wgt0
    ldr         r11,[sp,#68]                @load lvl_shift1
    ldr         r12,[sp,#72]                @load lvl_shift2
    vmov.s16    d7[0],r6                    @moved for scalar multiplication
    mul         r4,r11,r6                   @lvl_shift1 * wgt0
    ldr         r8,[sp,#56]                 @load wgt1
    ldr         r7,[sp,#52]                 @load off0
    vmov.s16    d7[1],r8                    @moved for scalar multiplication
    mla         r4,r12,r8,r4                @(lvl_shift1 * wgt0) + (lvl_shift2 * wgt1)
    ldr         r9,[sp,#60]                 @load off1
    add         r5,r7,r9                    @off0 + off1
    ldr         r10,[sp,#64]                @load shift
    add         r5,r5,#1                    @off0 + off1 + 1
    sub         r14,r10,#1                  @shift - 1
    ldr         r7,[sp,#80]                 @load wd
    lsl         r5,r5,r14                   @((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q14,r10                     @vmovq_n_s32(0-shift)
    add         r4,r4,r5                    @tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1))
    vdup.u32    q15,r4                      @vmovq_n_s32(tmp_lvl_shift)
    vneg.s32    q14,q14
    ldr         r4,[sp,#40]                 @load src_strd2
    lsl         r9,r7,#1
    ldr         r5,[sp,#44]                 @load dst_strd
    lsl         r3,r3,#1
    ldr         r14,[sp,#76]                @load ht
    lsl         r4,r4,#1

    cmp         r14,#0                      @check ht == 0
    beq         end_loops                   @if equal, then end the function

outer_loop:
    cmp         r7,#0                       @check wd == 0
    beq         end_loops                   @if equal, then end the function

core_loop:
    add         r6,r0,r3                    @pi2_src_tmp1 = pi2_src1 + 2*src_strd1(2* because pi1_src is a 16 bit pointer)
    add         r8,r1,r4                    @pi2_src_tmp2 = pi2_src2 + 2*src_strd2(2* because pi2_src is a 16 bit pointer)
    vld1.s16    {d0},[r0]!                  @load and increment the pi2_src1
    add         r10,r2,r5                   @pu1_dst_tmp = pu1_dst + dst_strd
    vld1.s16    {d1},[r1]!                  @load and increment the pi2_src2
    vmull.s16   q2,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0)
    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 ii iteration
    vmull.s16   q4,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1)
    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp1 ii iteration
    vadd.s32    q2,q2,q4                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2)

    vld1.s16    {d0},[r6],r3                @load and increment the pi2_src1 iii iteration
    vmull.s16   q5,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) ii iteration

    vld1.s16    {d1},[r8],r4                @load and increment the pi2_src2 iii iteration
    vadd.s32    q2,q2,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t)
    vmull.s16   q7,d0,d7[0]                 @vmull_n_s16(pi2_src1_val1, (int16_t) wgt0) iii iteration

    vld1.s16    {d2},[r6],r3                @load and increment the pi2_src_tmp1 iv iteration
    vmull.s16   q6,d3,d7[1]                 @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) ii iteration
    vshl.s32    q2,q2,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t)

    vld1.s16    {d3},[r8],r4                @load and increment the pi2_src_tmp1 iv iteration
    vadd.s32    q5,q5,q6                    @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) ii iteration

    vqmovun.s32 d4,q2                       @vqmovun_s32(sto_res_tmp1)
    vmull.s16   q8,d1,d7[1]                 @vmull_n_s16(pi2_src2_val1, (int16_t) wgt1) iii iteration

    vadd.s32    q5,q5,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) ii iteration
    vmov.s32    d5,d4                       @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    vadd.s32    q7,q7,q8                    @vaddq_s32(i4_tmp1_t1, i4_tmp1_t2) iii iteration

    vshl.s32    q5,q5,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) ii iteration
    vmull.s16   q9,d2,d7[0]                 @vmull_n_s16(pi2_src1_val2, (int16_t) wgt0) iv iteration
    vqmovn.u16  d4,q2                       @vqmovn_u16(sto_res_tmp3)
    vadd.s32    q7,q7,q15                   @vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t) iii iteration

    vqmovun.s32 d10,q5                      @vqmovun_s32(sto_res_tmp1) ii iteration
    vmull.s16   q10,d3,d7[1]                @vmull_n_s16(pi2_src2_val2, (int16_t) wgt1) iv iteration

    vshl.s32    q7,q7,q14                   @vshlq_s32(i4_tmp1_t1, tmp_shift_t) iii iteration
    vmov.s32    d11,d10                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) ii iteration

    vadd.s32    q9,q9,q10                   @vaddq_s32(i4_tmp2_t1, i4_tmp2_t2) iv iteration
    vqmovun.s32 d14,q7                      @vqmovun_s32(sto_res_tmp1) iii iteration

    vadd.s32    q9,q9,q15                   @vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t) iv iteration
    vst1.s32    {d4[0]},[r2]!               @store pu1_dst i iteration

    vqmovn.u16  d10,q5                      @vqmovn_u16(sto_res_tmp3) ii iteration
    vshl.s32    q9,q9,q14                   @vshlq_s32(i4_tmp2_t1, tmp_shift_t) iv iteration
    vst1.s32    {d10[0]},[r10],r5           @store pu1_dst ii iteration


    vmov.s32    d15,d14                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2) iii iteration
    vqmovn.u16  d14,q7                      @vqmovn_u16(sto_res_tmp3) iii iteration
    vqmovun.s32 d18,q9                      @vqmovun_s32(sto_res_tmp1) iv iteration
    vmov.s32    d19,d18                     @vcombine_u16(sto_res_tmp2, sto_res_tmp2)
    vst1.s32    {d14[0]},[r10],r5           @store pu1_dst iii iteration
    vqmovn.u16  d18,q9                      @vqmovn_u16(sto_res_tmp3) iv iteration
    subs        r7,r7,#4                    @decrement wd by 4 and check for 0
    vst1.s32    {d18[0]},[r10],r5           @store pu1_dst iv iteration

    bgt         core_loop                   @if greater than 0 repeat the core loop again

end_core_loop:
    rsb         r11,r9,r3,lsl #2            @2*src_strd1 - wd
    subs        r14,r14,#4                  @decrement the ht by 4
    rsb         r12,r9,r4,lsl #2            @2*src_strd2 - wd
    add         r0,r0,r11                   @pi2_src1 + 4*src_strd1 - 2*wd(since pi2_src1 is 16 bit pointer double the increment with double the wd decrement)
    asr         r7,r9,#1
    add         r1,r1,r12                   @pi2_src2 + 4*src_strd2 - 2*wd
    rsb         r10,r7,r5,lsl #2            @2*dst_strd - wd
    add         r2,r2,r10                   @pu1_dst + dst_std - wd
    bgt         core_loop                   @if ht is greater than 0 goto outer_loop

end_loops:
    ldmfd       sp!,{r4-r12,r15}            @reload the registers from sp