//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_default_weighted_pred_av8.s
//*
//* @brief
//*  Contains function definitions for default weighted prediction.
//*
//* @author
//*  Kaushik Senthoor R
//*
//* @par List of Functions:
//*
//*  - ih264_default_weighted_pred_luma_av8()
//*  - ih264_default_weighted_pred_chroma_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//*******************************************************************************
//* @function
//*  ih264_default_weighted_pred_luma_av8()
//*
//* @brief
//*  This routine performs the default weighted prediction as described in sec
//* 8.4.2.3.1 titled "Default weighted sample prediction process" for luma.
//*
//* @par Description:
//*  This function gets two ht x wd blocks, calculates their rounded-average and
//* stores it in the destination block.
//*
//* @param[in] puc_src1:
//*  UWORD8 Pointer to the buffer containing the first input block.
//*
//* @param[in] puc_src2:
//*  UWORD8 Pointer to the buffer containing the second input block.
//*
//* @param[out] puc_dst
//*  UWORD8 pointer to the destination where the output block is stored.
//*
//* @param[in] src_strd1
//*  Stride of the first input buffer
//*
//* @param[in] src_strd2
//*  Stride of the second input buffer
//*
//* @param[in] dst_strd
//*  Stride of the destination buffer
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
//*
//*******************************************************************************
//*/
//void ih264_default_weighted_pred_luma_av8(UWORD8 *puc_src1,
//                                          UWORD8 *puc_src2,
//                                          UWORD8 *puc_dst,
//                                          WORD32 src_strd1,
//                                          WORD32 src_strd2,
//                                          WORD32 dst_strd,
//                                          WORD32 ht,
//                                          WORD32 wd)
//
//**************Variables Vs Registers*****************************************
//    x0      => puc_src1
//    x1      => puc_src2
//    x2      => puc_dst
//    w3      => src_strd1
//    w4      => src_strd2
//    w5      => dst_strd
//    w6      => ht
//    w7      => wd
//
//**************Implementation notes*******************************************
//    - x0, x1 and x2 are advanced by their stride after every row through
//      post-indexed ld1/st1; w6 is the remaining-row counter.
//    - wd selects the loop: 16 -> loop_16 (8 rows per pass), 8 -> loop_8
//      (4 rows per pass), any other value -> loop_4 (4 rows per pass).
//      ht is assumed to be a multiple of the rows handled per pass, as in the
//      (ht,wd) pairs listed above.
//    - Only x0-x7 and v0-v31 are written, so no general-purpose register has
//      to be preserved. v8-v15 are overwritten by loop_16; push_v_regs and
//      pop_v_regs (from ih264_neon_macros.s) presumably spill and reload
//      them - confirm against the macro file.
//
.text
.p2align 2
.include "ih264_neon_macros.s"



    .global ih264_default_weighted_pred_luma_av8

ih264_default_weighted_pred_luma_av8:

    push_v_regs
    sxtw      x3, w3                        //widen the 32-bit strides so they can be
    sxtw      x4, w4                        //used as 64-bit post-index increments
    sxtw      x5, w5
    cmp       w7, #16
    beq       loop_16                       //branch if wd is 16
    cmp       w7, #8
    beq       loop_8                        //branch if wd is 8
                                            //otherwise fall through to the 4-wide loop

loop_4:                                     //each iteration processes four rows

    ld1       {v0.s}[0], [x0], x3           //load row 1 in source 1
    ld1       {v0.s}[1], [x0], x3           //load row 2 in source 1
    ld1       {v2.s}[0], [x1], x4           //load row 1 in source 2
    ld1       {v2.s}[1], [x1], x4           //load row 2 in source 2
    ld1       {v1.s}[0], [x0], x3           //load row 3 in source 1
    ld1       {v1.s}[1], [x0], x3           //load row 4 in source 1
    urhadd    v0.8b, v0.8b, v2.8b           //rounded average of rows 1 and 2
    ld1       {v3.s}[0], [x1], x4           //load row 3 in source 2
    ld1       {v3.s}[1], [x1], x4           //load row 4 in source 2
    subs      w6, w6, #4                    //decrement ht by 4
    st1       {v0.s}[0], [x2], x5           //store row 1 in destination
    st1       {v0.s}[1], [x2], x5           //store row 2 in destination
    urhadd    v1.8b, v1.8b, v3.8b           //rounded average of rows 3 and 4
    st1       {v1.s}[0], [x2], x5           //store row 3 in destination
    st1       {v1.s}[1], [x2], x5           //store row 4 in destination
    bgt       loop_4                        //if greater than 0 repeat the loop again
    b         end_loops

loop_8:                                     //each iteration processes four rows

    ld1       {v0.8b}, [x0], x3             //load row 1 in source 1
    ld1       {v4.8b}, [x1], x4             //load row 1 in source 2
    ld1       {v1.8b}, [x0], x3             //load row 2 in source 1
    ld1       {v5.8b}, [x1], x4             //load row 2 in source 2
    ld1       {v2.8b}, [x0], x3             //load row 3 in source 1
    urhadd    v0.8b, v0.8b, v4.8b           //rounded average of row 1
    urhadd    v1.8b, v1.8b, v5.8b           //rounded average of row 2
    ld1       {v6.8b}, [x1], x4             //load row 3 in source 2
    ld1       {v3.8b}, [x0], x3             //load row 4 in source 1
    urhadd    v2.8b, v2.8b, v6.8b           //rounded average of row 3
    ld1       {v7.8b}, [x1], x4             //load row 4 in source 2
    subs      w6, w6, #4                    //decrement ht by 4
    st1       {v0.8b}, [x2], x5             //store row 1 in destination
    urhadd    v3.8b, v3.8b, v7.8b           //rounded average of row 4
    st1       {v1.8b}, [x2], x5             //store row 2 in destination
    st1       {v2.8b}, [x2], x5             //store row 3 in destination
    st1       {v3.8b}, [x2], x5             //store row 4 in destination
    bgt       loop_8                        //if greater than 0 repeat the loop again
    b         end_loops

loop_16:                                    //each iteration processes eight rows

    ld1       {v0.8b, v1.8b}, [x0], x3      //load row 1 in source 1
    ld1       {v16.8b, v17.8b}, [x1], x4    //load row 1 in source 2
    ld1       {v2.8b, v3.8b}, [x0], x3      //load row 2 in source 1
    ld1       {v18.8b, v19.8b}, [x1], x4    //load row 2 in source 2
    urhadd    v0.8b, v0.8b, v16.8b          //rounded average of row 1
    urhadd    v1.8b, v1.8b, v17.8b
    ld1       {v4.8b, v5.8b}, [x0], x3      //load row 3 in source 1
    ld1       {v20.8b, v21.8b}, [x1], x4    //load row 3 in source 2
    urhadd    v2.8b, v2.8b, v18.8b          //rounded average of row 2
    urhadd    v3.8b, v3.8b, v19.8b
    ld1       {v6.8b, v7.8b}, [x0], x3      //load row 4 in source 1
    ld1       {v22.8b, v23.8b}, [x1], x4    //load row 4 in source 2
    urhadd    v4.8b, v4.8b, v20.8b          //rounded average of row 3
    urhadd    v5.8b, v5.8b, v21.8b
    ld1       {v8.8b, v9.8b}, [x0], x3      //load row 5 in source 1
    ld1       {v24.8b, v25.8b}, [x1], x4    //load row 5 in source 2
    urhadd    v6.8b, v6.8b, v22.8b          //rounded average of row 4
    urhadd    v7.8b, v7.8b, v23.8b
    ld1       {v10.8b, v11.8b}, [x0], x3    //load row 6 in source 1
    ld1       {v26.8b, v27.8b}, [x1], x4    //load row 6 in source 2
    urhadd    v8.8b, v8.8b, v24.8b          //rounded average of row 5
    urhadd    v9.8b, v9.8b, v25.8b
    ld1       {v12.8b, v13.8b}, [x0], x3    //load row 7 in source 1
    ld1       {v28.8b, v29.8b}, [x1], x4    //load row 7 in source 2
    urhadd    v10.8b, v10.8b, v26.8b        //rounded average of row 6
    urhadd    v11.8b, v11.8b, v27.8b
    ld1       {v14.8b, v15.8b}, [x0], x3    //load row 8 in source 1
    ld1       {v30.8b, v31.8b}, [x1], x4    //load row 8 in source 2
    urhadd    v12.8b, v12.8b, v28.8b        //rounded average of row 7
    urhadd    v13.8b, v13.8b, v29.8b
    st1       {v0.8b, v1.8b}, [x2], x5      //store row 1 in destination
    st1       {v2.8b, v3.8b}, [x2], x5      //store row 2 in destination
    urhadd    v14.8b, v14.8b, v30.8b        //rounded average of row 8
    urhadd    v15.8b, v15.8b, v31.8b
    st1       {v4.8b, v5.8b}, [x2], x5      //store row 3 in destination
    st1       {v6.8b, v7.8b}, [x2], x5      //store row 4 in destination
    subs      w6, w6, #8                    //decrement ht by 8
    st1       {v8.8b, v9.8b}, [x2], x5      //store row 5 in destination
    st1       {v10.8b, v11.8b}, [x2], x5    //store row 6 in destination
    st1       {v12.8b, v13.8b}, [x2], x5    //store row 7 in destination
    st1       {v14.8b, v15.8b}, [x2], x5    //store row 8 in destination
    bgt       loop_16                       //if greater than 0 repeat the loop again

end_loops:

    pop_v_regs
    ret


//*******************************************************************************
//* @function
//*  ih264_default_weighted_pred_chroma_av8()
//*
//* @brief
//*  This routine performs the default weighted prediction as described in sec
//* 8.4.2.3.1 titled "Default weighted sample prediction process" for chroma.
//*
//* @par Description:
//*  This function gets two ht x wd blocks, calculates their rounded-average and
//* stores it in the destination block for U and V.
//*
//* @param[in] puc_src1:
//*  UWORD8 Pointer to the buffer containing the first input block.
//*
//* @param[in] puc_src2:
//*  UWORD8 Pointer to the buffer containing the second input block.
//*
//* @param[out] puc_dst
//*  UWORD8 pointer to the destination where the output block is stored.
//*
//* @param[in] src_strd1
//*  Stride of the first input buffer
//*
//* @param[in] src_strd2
//*  Stride of the second input buffer
//*
//* @param[in] dst_strd
//*  Stride of the destination buffer
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] wd
//*  integer width of the array
//*
//* @returns
//*  None
//*
//* @remarks
//*  (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
//*
//*******************************************************************************
//*/
//void ih264_default_weighted_pred_chroma_av8(UWORD8 *puc_src1,
//                                            UWORD8 *puc_src2,
//                                            UWORD8 *puc_dst,
//                                            WORD32 src_strd1,
//                                            WORD32 src_strd2,
//                                            WORD32 dst_strd,
//                                            WORD32 ht,
//                                            WORD32 wd)
//
//**************Variables Vs Registers*****************************************
//    x0      => puc_src1
//    x1      => puc_src2
//    x2      => puc_dst
//    w3      => src_strd1
//    w4      => src_strd2
//    w5      => dst_strd
//    w6      => ht
//    w7      => wd
//
//**************Implementation notes*******************************************
//    - Each row moves 2*wd bytes (4, 8 and 16 bytes for wd = 2, 4 and 8),
//      presumably interleaved U/V samples - confirm against the caller.
//    - wd selects the loop: 8 -> loop_8_uv (4 rows per pass), 4 -> loop_4_uv
//      (2 rows per pass), any other value -> loop_2_uv (2 rows per pass).
//      ht is assumed to be a multiple of the rows handled per pass.
//    - Only x0-x7 and v0-v15 are written, so no general-purpose register has
//      to be preserved. v8-v15 are overwritten by loop_8_uv; push_v_regs and
//      pop_v_regs (from ih264_neon_macros.s) presumably spill and reload
//      them - confirm against the macro file.
//




    .global ih264_default_weighted_pred_chroma_av8

ih264_default_weighted_pred_chroma_av8:

    push_v_regs
    sxtw      x3, w3                        //widen the 32-bit strides so they can be
    sxtw      x4, w4                        //used as 64-bit post-index increments
    sxtw      x5, w5
    cmp       w7, #8
    beq       loop_8_uv                     //branch if wd is 8
    cmp       w7, #4
    beq       loop_4_uv                     //branch if wd is 4
                                            //otherwise fall through to the 2-wide loop

loop_2_uv:                                  //each iteration processes two rows

    ld1       {v0.s}[0], [x0], x3           //load row 1 in source 1
    ld1       {v0.s}[1], [x0], x3           //load row 2 in source 1
    ld1       {v1.s}[0], [x1], x4           //load row 1 in source 2
    ld1       {v1.s}[1], [x1], x4           //load row 2 in source 2
    urhadd    v0.8b, v0.8b, v1.8b           //rounded average of rows 1 and 2
    subs      w6, w6, #2                    //decrement ht by 2
    st1       {v0.s}[0], [x2], x5           //store row 1 in destination
    st1       {v0.s}[1], [x2], x5           //store row 2 in destination
    bgt       loop_2_uv                     //if greater than 0 repeat the loop again
    b         end_loops_uv

loop_4_uv:                                  //each iteration processes two rows

    ld1       {v0.8b}, [x0], x3             //load row 1 in source 1
    ld1       {v2.8b}, [x1], x4             //load row 1 in source 2
    ld1       {v1.8b}, [x0], x3             //load row 2 in source 1
    urhadd    v0.8b, v0.8b, v2.8b           //rounded average of row 1
    ld1       {v3.8b}, [x1], x4             //load row 2 in source 2
    urhadd    v1.8b, v1.8b, v3.8b           //rounded average of row 2
    st1       {v0.8b}, [x2], x5             //store row 1 in destination
    subs      w6, w6, #2                    //decrement ht by 2
    st1       {v1.8b}, [x2], x5             //store row 2 in destination
    bgt       loop_4_uv                     //if greater than 0 repeat the loop again
    b         end_loops_uv

loop_8_uv:                                  //each iteration processes four rows

    ld1       {v0.8b, v1.8b}, [x0], x3      //load row 1 in source 1
    ld1       {v8.8b, v9.8b}, [x1], x4      //load row 1 in source 2
    ld1       {v2.8b, v3.8b}, [x0], x3      //load row 2 in source 1
    urhadd    v0.8b, v0.8b, v8.8b           //rounded average of row 1
    urhadd    v1.8b, v1.8b, v9.8b
    ld1       {v10.8b, v11.8b}, [x1], x4    //load row 2 in source 2
    ld1       {v4.8b, v5.8b}, [x0], x3      //load row 3 in source 1
    urhadd    v2.8b, v2.8b, v10.8b          //rounded average of row 2
    urhadd    v3.8b, v3.8b, v11.8b
    ld1       {v12.8b, v13.8b}, [x1], x4    //load row 3 in source 2
    ld1       {v6.8b, v7.8b}, [x0], x3      //load row 4 in source 1
    urhadd    v4.8b, v4.8b, v12.8b          //rounded average of row 3
    urhadd    v5.8b, v5.8b, v13.8b
    ld1       {v14.8b, v15.8b}, [x1], x4    //load row 4 in source 2
    st1       {v0.8b, v1.8b}, [x2], x5      //store row 1 in destination
    urhadd    v6.8b, v6.8b, v14.8b          //rounded average of row 4
    urhadd    v7.8b, v7.8b, v15.8b
    st1       {v2.8b, v3.8b}, [x2], x5      //store row 2 in destination
    subs      w6, w6, #4                    //decrement ht by 4
    st1       {v4.8b, v5.8b}, [x2], x5      //store row 3 in destination
    st1       {v6.8b, v7.8b}, [x2], x5      //store row 4 in destination
    bgt       loop_8_uv                     //if greater than 0 repeat the loop again

end_loops_uv:

    pop_v_regs
    ret


