//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264_iquant_itrans_recon_dc_av8.s
// *
// * @brief
// *  Contains function definitions for single stage inverse transform
// *  (DC-only variants)
// *
// * @author
// *  Mohit
// *
// * @par List of Functions:
// *  - ih264_iquant_itrans_recon_4x4_dc_av8()
// *  - ih264_iquant_itrans_recon_8x8_dc_av8()
// *  - ih264_iquant_itrans_recon_chroma_4x4_dc_av8()
// *
// * @remarks
// *  Syntax : GNU as, AArch64 (Advanced SIMD).
// *  ABI    : AAPCS64. Arguments 0-7 arrive in x0-x7 / w0-w7; the remaining
// *           arguments are read from the caller's stack in 8-byte slots.
// *  All three routines are leaf functions and only use the caller-saved
// *  registers x8-x10, v0-v3, v16-v23, v30 and v31, so nothing is spilled.
// *
// *******************************************************************************
//*/


.include "ih264_neon_macros.s"


///**
// *******************************************************************************
// *
// * @brief
// *  This function performs inverse quant and Inverse transform type Ci4 for 4*4
// *  block for dc input pattern only, i.e. only the (0,0) element of the input
// *  4x4 block is non-zero. For complete function, refer
// *  ih264_iquant_itrans_recon_a9.s
// *
// * @par Description:
// *  Obtains the DC term, converts it to a residue with a rounding shift right
// *  by 6, adds that residue to all 16 prediction pixels and stores the result
// *  saturated to 8 bits.
// *
// *  The DC term comes from one of two places:
// *   - iq_start_idx == 1 (intra case): it is read as-is from pi2_dc_ld_addr[0].
// *   - otherwise: it is computed from pi2_src[0] as
// *         t  = pu2_weigh_mat[0] * pu2_iscal_mat[0]      (16-bit product)
// *         t  = (t * pi2_src[0]) << u4_qp_div_6          (32-bit)
// *         dc = saturate16(rounding_shift_right(t, 4))
// *
// * @param[in] pi2_src
// *  Input 4x4 coefficients (only element 0 is read)
// *
// * @param[in] pu1_pred
// *  Prediction 4x4 block
// *
// * @param[out] pu1_out
// *  Output 4x4 block
// *
// * @param[in] pred_strd
// *  Prediction stride
// *
// * @param[in] out_strd
// *  Output stride
// *
// * @param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix (only element 0 is read)
// *
// * @param[in] pu2_weigh_mat
// *  Pointer to weight matrix (only element 0 is read)
// *
// * @param[in] u4_qp_div_6
// *  QP / 6, used as the left-shift amount
// *
// * @param[in] pi4_tmp
// *  Temporary buffer of size 1*16 (not accessed by this routine)
// *
// * @param[in] iq_start_idx
// *  1 selects the pi2_dc_ld_addr path, any other value the pi2_src path
// *
// * @param[in] pi2_dc_ld_addr
// *  Address of the ready-made DC term (only dereferenced when
// *  iq_start_idx == 1)
// *
// * @returns  Void
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
//                                      UWORD8 *pu1_pred,
//                                      UWORD8 *pu1_out,
//                                      WORD32 pred_strd,
//                                      WORD32 out_strd,
//                                      const UWORD16 *pu2_iscal_mat,
//                                      const UWORD16 *pu2_weigh_mat,
//                                      UWORD32 u4_qp_div_6,
//                                      WORD32 *pi4_tmp,
//                                      WORD32 iq_start_idx
//                                      WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0        => *pi2_src
//x1        => *pu1_pred
//x2        => *pu1_out
//w3        => pred_strd
//w4        => out_strd
//x5        => *pu2_iscal_mat
//x6        => *pu2_weigh_mat
//w7        => u4_qp_div_6
//[sp, #0]  => pi4_tmp           (not read)
//[sp, #8]  => iq_start_idx
//[sp, #16] => pi2_dc_ld_addr
//Scratch   :  w8, x10, v0-v2, v30, condition flags

    .text
    .p2align 2

    .global ih264_iquant_itrans_recon_4x4_dc_av8
ih264_iquant_itrans_recon_4x4_dc_av8:

    sxtw      x3, w3                      // strides are WORD32: widen for post-index addressing
    sxtw      x4, w4
    ldr       w8, [sp, #8]                // w8 = iq_start_idx
    cmp       w8, #1                      // 1 => intra case, DC term supplied separately
    b.ne      .Lluma_4x4_dc_dequant

    ldr       x10, [sp, #16]              // x10 = pi2_dc_ld_addr
    ld1       {v0.h}[0], [x10]            // v0.h[0] = ready-made DC term
    b         .Lluma_4x4_dc_recon

.Lluma_4x4_dc_dequant:
    // Only lane 0 carries data below; the other lanes are don't-care.
    dup       v30.4s, w7                  // per-lane left-shift count = u4_qp_div_6
    ld1       {v0.h}[0], [x5]             // pu2_iscal_mat[0]
    ld1       {v1.h}[0], [x6]             // pu2_weigh_mat[0]
    ld1       {v2.h}[0], [x0]             // pi2_src[0]
    mul       v0.4h, v1.4h, v0.4h         // weigh * iscal, kept to 16 bits
    smull     v0.4s, v0.4h, v2.4h         // * src, widened to 32 bits
    sshl      v0.4s, v0.4s, v30.4s        // << u4_qp_div_6
    sqrshrn   v0.4h, v0.4s, #4            // rounding >> 4, saturate to 16 bits

.Lluma_4x4_dc_recon:
    dup       v0.8h, v0.h[0]              // broadcast DC term to all lanes
    srshr     v0.8h, v0.8h, #6            // residue = (dc + 32) >> 6

    ld1       {v1.s}[0], [x1], x3         // prediction rows 0,1 -> v1
    ld1       {v1.s}[1], [x1], x3
    ld1       {v2.s}[0], [x1], x3         // prediction rows 2,3 -> v2
    ld1       {v2.s}[1], [x1]

    uaddw     v1.8h, v0.8h, v1.8b         // residue + zero-extended prediction
    uaddw     v2.8h, v0.8h, v2.8b

    sqxtun    v1.8b, v1.8h                // clip to 0..255
    sqxtun    v2.8b, v2.8h

    st1       {v1.s}[0], [x2], x4         // output rows 0..3
    st1       {v1.s}[1], [x2], x4
    st1       {v2.s}[0], [x2], x4
    st1       {v2.s}[1], [x2]
    ret

// /*
// ********************************************************************************
// *
// * @brief This function reconstructs a 4x4 sub block from quantized residue and
// * prediction buffer if only dc value is present for residue
// *
// * @par Description:
// *  The DC term read from pi2_dc_src[0] is turned into a residue with a
// *  rounding shift right by 6. The residue is added to the even-indexed bytes
// *  of each 8-byte prediction row (one plane of the interleaved chroma data),
// *  clipped to 8 bits and written to the even-indexed bytes of the matching
// *  output row. The odd-indexed output bytes are read and written back
// *  unchanged.
// *
// * @param[in] pi2_src
// *  quantized dc coefficient (not accessed by this routine)
// *
// * @param[in] pu1_pred
// *  prediction 4x4 block in interleaved format
// *
// * @param[in,out] pu1_out
// *  recon 4x4 block in interleaved format; read first so that the other
// *  plane's bytes are preserved
// *
// * @param[in] pred_strd,
// *  Prediction buffer stride in interleaved format
// *
// * @param[in] out_strd
// *  recon buffer Stride
// *
// * @param[in] pu2_iscal_mat, pu2_weigh_mat, u4_qp_div_6, pi2_tmp
// *  not accessed by this routine
// *
// * @param[in] pi2_dc_src
// *  address of the DC term
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
//                                             UWORD8 *pu1_pred,
//                                             UWORD8 *pu1_out,
//                                             WORD32 pred_strd,
//                                             WORD32 out_strd,
//                                             const UWORD16 *pu2_iscal_mat,
//                                             const UWORD16 *pu2_weigh_mat,
//                                             UWORD32 u4_qp_div_6,
//                                             WORD16 *pi2_tmp,
//                                             WORD16 *pi2_dc_src)
// Register Usage
// x0        : pi2_src           (not read)
// x1        : pu1_pred
// x2        : pu1_out
// w3        : pred_strd
// w4        : out_strd
// x5        : pu2_iscal_mat     (not read)
// x6        : pu2_weigh_mat     (not read)
// w7        : u4_qp_div_6       (not read)
// [sp, #0]  : pi2_tmp           (not read)
// [sp, #8]  : pi2_dc_src
// Scratch   : x9, x10, v0-v2, v16, v17, v31
// No need for pushing arm and neon registers


    .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
ih264_iquant_itrans_recon_chroma_4x4_dc_av8:

    sxtw      x3, w3                      // strides are WORD32: widen for post-index addressing
    sxtw      x4, w4
    ldr       x9, [sp, #8]                // x9 = pi2_dc_src
    mov       x10, x2                     // x2 walks pu1_out for the loads, x10 for the stores

    ld1r      {v0.8h}, [x9]               // broadcast DC term to all lanes
    srshr     v0.8h, v0.8h, #6            // residue = (dc + 32) >> 6
    movi      v31.8h, #0x00ff             // selects the low byte of every halfword

    ld1       {v1.d}[0], [x1], x3         // prediction rows 0,1 -> v1
    ld1       {v1.d}[1], [x1], x3
    ld1       {v2.d}[0], [x1], x3         // prediction rows 2,3 -> v2
    ld1       {v2.d}[1], [x1]

    ld1       {v16.d}[0], [x2], x4        // current output rows 0,1 -> v16
    ld1       {v16.d}[1], [x2], x4
    ld1       {v17.d}[0], [x2], x4        // current output rows 2,3 -> v17
    ld1       {v17.d}[1], [x2]

    xtn       v1.8b, v1.8h                // keep even-indexed bytes (de-interleave)
    xtn       v2.8b, v2.8h

    uaddw     v1.8h, v0.8h, v1.8b         // residue + zero-extended prediction
    uaddw     v2.8h, v0.8h, v2.8b

    sqxtun    v1.8b, v1.8h                // clip to 0..255
    sqxtun    v2.8b, v2.8h

    uxtl      v1.8h, v1.8b                // back to one pixel per halfword, high byte 0
    uxtl      v2.8h, v2.8b

    bit       v16.16b, v1.16b, v31.16b    // overwrite only the even-indexed output bytes
    bit       v17.16b, v2.16b, v31.16b

    st1       {v16.d}[0], [x10], x4       // output rows 0..3
    st1       {v16.d}[1], [x10], x4
    st1       {v17.d}[0], [x10], x4
    st1       {v17.d}[1], [x10]
    ret

///*
// *******************************************************************************
// *
// * @brief
// *  This function performs inverse quant and Inverse transform type Ci8 for 8*8
// *  block [Only for Dc coeff]
// *
// * @par Description:
// *  Computes the DC term from pi2_src[0] as
// *         t  = pu2_iscal_mat[0] * pu2_weigh_mat[0]    (16-bit product)
// *         t  = (t * pi2_src[0]) << u4_qp_div_6        (32-bit)
// *         dc = saturate16(rounding_shift_right(t, 6))
// *  converts it to a residue with a further rounding shift right by 6, adds
// *  the residue to all 64 prediction pixels and stores the result saturated
// *  to 8 bits.
// *
// * @param[in] pi2_src
// *  Input 8x8 coefficients (only element 0 is read)
// *
// * @param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * @param[out] pu1_out
// *  Output 8x8 block
// *
// * @param[in] pred_strd,
// *  Prediction stride
// *
// * @param[in] out_strd
// *  Output Stride
// *
// * @param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix (only element 0 is read)
// *
// * @param[in] pu2_weigh_mat
// *  Pointer to weight matrix (only element 0 is read)
// *
// * @param[in] u4_qp_div_6
// *  QP / 6, used as the left-shift amount
// *
// * @param[in] pi4_tmp
// *  temporary buffer of size 1*64 (not accessed by this routine)
// *
// * @param[in] iq_start_idx, pi2_dc_ld_addr
// *  not accessed by this routine
// *
// * @returns  Void
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_dc_8x8(WORD16 *pi2_src,
//                                      UWORD8 *pu1_pred,
//                                      UWORD8 *pu1_out,
//                                      WORD32 pred_strd,
//                                      WORD32 out_strd,
//                                      const UWORD16 *pu2_iscal_mat,
//                                      const UWORD16 *pu2_weigh_mat,
//                                      UWORD32 u4_qp_div_6,
//                                      WORD32 *pi4_tmp,
//                                      WORD32 iq_start_idx
//                                      WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0       => *pi2_src
//x1       => *pu1_pred
//x2       => *pu1_out
//w3       => pred_strd
//w4       => out_strd
//x5       => *pu2_iscal_mat
//x6       => *pu2_weigh_mat
//w7       => u4_qp_div_6
//NOT USED => pi4_tmp
//NOT USED => iq_start_idx
//NOT USED => pi2_dc_ld_addr
//Scratch  :  v0-v3, v16-v23

    .global ih264_iquant_itrans_recon_8x8_dc_av8
ih264_iquant_itrans_recon_8x8_dc_av8:

    sxtw      x3, w3                      // strides are WORD32: widen for post-index addressing
    sxtw      x4, w4

    // Only lane 0 carries data until the broadcast below.
    ld1       {v1.h}[0], [x5]             // pu2_iscal_mat[0]
    ld1       {v2.h}[0], [x6]             // pu2_weigh_mat[0]
    ld1       {v0.h}[0], [x0]             // pi2_src[0]
    dup       v3.4s, w7                   // per-lane left-shift count = u4_qp_div_6

    mul       v1.4h, v1.4h, v2.4h         // iscal * weigh, kept to 16 bits
    smull     v0.4s, v0.4h, v1.4h         // * src, widened to 32 bits
    sshl      v0.4s, v0.4s, v3.4s         // << u4_qp_div_6

    sqrshrn   v0.4h, v0.4s, #6            // rounding >> 6, saturate to 16 bits
    srshr     v0.8h, v0.8h, #6            // residue = (dc + 32) >> 6
    dup       v0.8h, v0.h[0]              // broadcast residue to all lanes

    ld1       {v16.8b}, [x1], x3          // prediction rows 0..7
    ld1       {v17.8b}, [x1], x3
    ld1       {v18.8b}, [x1], x3
    ld1       {v19.8b}, [x1], x3
    ld1       {v20.8b}, [x1], x3
    ld1       {v21.8b}, [x1], x3
    ld1       {v22.8b}, [x1], x3
    ld1       {v23.8b}, [x1]

    uaddw     v16.8h, v0.8h, v16.8b       // residue + zero-extended prediction
    uaddw     v17.8h, v0.8h, v17.8b
    uaddw     v18.8h, v0.8h, v18.8b
    uaddw     v19.8h, v0.8h, v19.8b
    uaddw     v20.8h, v0.8h, v20.8b
    uaddw     v21.8h, v0.8h, v21.8b
    uaddw     v22.8h, v0.8h, v22.8b
    uaddw     v23.8h, v0.8h, v23.8b

    sqxtun    v16.8b, v16.8h              // clip to 0..255
    sqxtun    v17.8b, v17.8h
    sqxtun    v18.8b, v18.8h
    sqxtun    v19.8b, v19.8h
    sqxtun    v20.8b, v20.8h
    sqxtun    v21.8b, v21.8h
    sqxtun    v22.8b, v22.8h
    sqxtun    v23.8b, v23.8h

    st1       {v16.8b}, [x2], x4          // output rows 0..7
    st1       {v17.8b}, [x2], x4
    st1       {v18.8b}, [x2], x4
    st1       {v19.8b}, [x2], x4
    st1       {v20.8b}, [x2], x4
    st1       {v21.8b}, [x2], x4
    st1       {v22.8b}, [x2], x4
    st1       {v23.8b}, [x2]

    ret