///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_weighted_pred_uni.s
//*
//* @brief
//*  contains function definitions for weighted prediction used in inter
//*  prediction
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_weighted_pred_uni()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

///**
//*******************************************************************************
//*
//* @brief
//*  does uni-weighted prediction on the array pointed by pi2_src and stores
//*  it at the location pointed by pi2_dst assumptions : the function is
//*  optimized considering the fact width and height are multiple of 2.
//*
//* @par description:
//*  dst = ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift +
//*  offset
//*
//* @param[in] pi2_src
//*  pointer to the source
//*
//* @param[out] pu1_dst
//*  pointer to the destination
//*
//* @param[in] src_strd
//*  source stride
//*
//* @param[in] dst_strd
//*  destination stride
//*
//* @param[in] wgt0
//*  weight to be multiplied to the source
//*
//* @param[in] off0
//*  offset to be added after rounding and shifting
//*
//* @param[in] shift
//*  (14 bit depth) + log2_weight_denominator
//*
//* @param[in] lvl_shift
//*  added before shift and offset
//*
//* @param[in] ht
//*  height of the source
//*
//* @param[in] wd
//*  width of the source
//*
//* @returns
//*  none
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/

//void ihevc_weighted_pred_uni(word16 *pi2_src,
//                             uword8 *pu1_dst,
//                             word32 src_strd,
//                             word32 dst_strd,
//                             word32 wgt0,
//                             word32 off0,
//                             word32 shift,
//                             word32 lvl_shift,
//                             word32 ht,
//                             word32 wd)

//**************variables vs registers*****************************************
//  x0 => *pi2_src
//  x1 => *pu1_dst
//  x2 => src_strd
//  x3 => dst_strd
//  x4 => wgt0
//  x5 => off0
//  x6 => shift
//  x7 => lvl_shift
//  x8 => ht         (loaded from the stack)
//  x9 => wd         (loaded from the stack)

.text
.align 4

.include "ihevc_neon_macros.s"

.globl ihevc_weighted_pred_uni_av8

.type ihevc_weighted_pred_uni_av8, %function

ihevc_weighted_pred_uni_av8:

    // args 9 and 10 (ht, wd) arrive on the stack per AAPCS64
    ldr w8,[sp,#0]              //load ht
    ldr w9,[sp,#8]              //load wd

    // x19-x22 are callee-saved and clobbered below; preserve them
    stp x19, x20,[sp,#-16]!
    stp x21, x22,[sp,#-16]!

    // NOTE(review): this save/reload shuffle of x4-x9 through x15-x17/x19-x21
    // is inherited from the bi-pred variant; the stale "src_strd2/dst_strd"
    // comments from that file have been corrected below.
    mov x15,x4                  //save wgt0
    mov x16,x5                  //save off0
    mov x17,x6                  //save shift
    mov x19,x7                  //save lvl_shift
    mov x20,x8                  //save ht
    mov x21,x9                  //save wd

    mov x4,x15                  //reload wgt0
    mov x7,x19                  //reload lvl_shift
    mov x11,#1                  //constant 1 for the rounding term
    mov x5,x16                  //reload off0
    mul x10, x7, x4             //lvl_shift * wgt0
    mov x6,x17                  //reload shift
    mov x8,x20                  //reload ht
    lsl x22,x5,x6               //off0 << shift
    add x10,x10,x22             //lvl_shift * wgt0 + (off0 << shift)
    mov x9,x21                  //reload wd
    sub x12,x6,#1               //shift - 1
    mov v0.h[0], w4             //wgt0 into lane 0 for smull ..., v0.h[0]
    lsl x2,x2,#1                //src_strd in bytes (pi2_src is a 16-bit pointer)
    dup v28.4s,w6               //vmovq_n_s32(tmp_shift)
    lsl x22,x11,x12             //1 << (shift - 1)
    add x10,x10,x22             //tmp_lvl_shift += (1 << (shift - 1))
    dup v30.4s,w10              //vmovq_n_s32(tmp_lvl_shift)
    neg v28.4s, v28.4s          //negate so sshl acts as an arithmetic right shift
    lsl x4,x9,#1                //x4 = 2*wd = bytes per source row of samples

    cmp x8,#0                   //check ht == 0
    beq end_loops               //if equal, then end the function

outer_loop:
    cmp x9,#0                   //check wd == 0
    beq end_loops               //if equal, then end the function

// processes a 4x4 tile per iteration: row i via x0/x1, rows ii-iv via x5/x6;
// the four rows are software-pipelined (mul / add / shift / narrow interleaved)
core_loop:
    add x5,x0,x2                //pi2_src_tmp1 = pi2_src + src_strd (byte stride, rows ii-iv)
    add x6,x1,x3                //pu1_dst_tmp = pu1_dst + dst_strd
    ld1 {v1.4h},[x0],#8         //load and increment the pi2_src
    ld1 {v2.4h},[x5],x2         //load and increment the pi2_src_tmp ii iteration
    smull v4.4s, v1.4h, v0.h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0)

    add v4.4s, v4.4s , v30.4s   //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t)
    ld1 {v3.4h},[x5],x2         //load and increment the pi2_src iii iteration

    smull v6.4s, v2.4h, v0.h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) ii iteration
    ld1 {v5.4h},[x5],x2         //load and increment the pi2_src_tmp iv iteration

    sshl v4.4s,v4.4s,v28.4s     //vshlq_s32(i4_tmp1_t, tmp_shift_t)
    add v6.4s, v6.4s , v30.4s   //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) ii iteration

    smull v7.4s, v3.4h, v0.h[0] //vmull_n_s16(pi2_src_val1, (int16_t) wgt0) iii iteration
    sqxtun v4.4h, v4.4s         //vqmovun_s32(sto_res_tmp1)

    add v7.4s, v7.4s , v30.4s   //vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t) iii iteration

    sshl v6.4s,v6.4s,v28.4s     //vshlq_s32(i4_tmp2_t, tmp_shift_t) ii iteration

    smull v16.4s, v5.4h, v0.h[0] //vmull_n_s16(pi2_src_val2, (int16_t) wgt0) iv iteration
    uqxtn v4.8b, v4.8h          //vqmovn_u16(sto_res_tmp3)

    sshl v7.4s,v7.4s,v28.4s     //vshlq_s32(i4_tmp1_t, tmp_shift_t) iii iteration
    sqxtun v6.4h, v6.4s         //vqmovun_s32(sto_res_tmp1) ii iteration

    add v16.4s, v16.4s , v30.4s //vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t) iv iteration

    sqxtun v7.4h, v7.4s         //vqmovun_s32(sto_res_tmp1) iii iteration

    sshl v16.4s,v16.4s,v28.4s   //vshlq_s32(i4_tmp2_t, tmp_shift_t) iv iteration
    st1 {v4.s}[0],[x1],#4       //store pu1_dst i iteration

    uqxtn v6.8b, v6.8h          //vqmovn_u16(sto_res_tmp3) ii iteration
    st1 {v6.s}[0],[x6],x3       //store pu1_dst ii iteration

    uqxtn v7.8b, v7.8h          //vqmovn_u16(sto_res_tmp3) iii iteration
    sqxtun v16.4h, v16.4s       //vqmovun_s32(sto_res_tmp1) iv iteration

    st1 {v7.s}[0],[x6],x3       //store pu1_dst iii iteration
    uqxtn v16.8b, v16.8h        //vqmovn_u16(sto_res_tmp3) iv iteration

    subs x9,x9,#4               //decrement wd by 4 and check for 0
    st1 {v16.s}[0],[x6],x3     //store pu1_dst iv iteration
    bgt core_loop               //if greater than 0 repeat the core loop again

end_core_loop:
    // advance both pointers from the end of this 4-row strip to the next one;
    // x2 = row stride in bytes, so (x2 << 2) = 4 source rows in bytes
    sub x22,x4,x2,lsl #2        //2*wd - 4*(2*src_strd)
    neg x11, x22                //x11 = 4*(2*src_strd) - 2*wd
    subs x8,x8,#4               //decrement the ht by 4 (flags survive to bgt below)
    add x0,x0,x11               //pi2_src += 4*src_strd - wd elements (byte math doubled for 16-bit ptr)
    asr x9,x4,#1                //restore wd (x4 still holds 2*wd)
    sub x22,x9,x3,lsl #2        //wd - 4*dst_strd
    neg x12, x22                //x12 = 4*dst_strd - wd

    add x1,x1,x12               //pu1_dst += 4*dst_strd - wd
    bgt core_loop               //if ht > 0 process the next strip (wd restored, so core_loop directly)

end_loops:
    ldp x21, x22,[sp],#16       //restore callee-saved registers
    ldp x19, x20,[sp],#16

    ret