//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264_iquant_itrans_recon_dc_av8.s
// *
// * @brief
// *  Contains function definitions for single stage inverse transform
// *
// * @author
// *  Mohit
// *
// * @par List of Functions:
// *  - ih264_iquant_itrans_recon_4x4_dc_av8()
// *  - ih264_iquant_itrans_recon_8x8_dc_av8()
// *  - ih264_iquant_itrans_recon_chroma_4x4_dc_av8()
// *
// * @remarks
// *  None
// *
// *******************************************************************************
//*/


.include "ih264_neon_macros.s"


///**
// *******************************************************************************
// *
// * @brief
// *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
// *  for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
// *  non-zero.
// *  For complete function, refer ih264_iquant_itrans_recon_a9.s
// *
// * @par Description:
// *  Performs inverse transform Ci4 and adds the residue to get the
// *  reconstructed block
// *
// * @param[in] pi2_src
// *  Input 4x4 coefficients
// *
// * @param[in] pu1_pred
// *  Prediction 4x4 block
// *
// * @param[out] pu1_out
// *  Output 4x4 block
// *
// * @param[in] u4_qp_div_6
// *  QP
// *
// * @param[in] pu2_weigh_mat
// *  Pointer to weight matrix
// *
// * @param[in] pred_strd,
// *  Prediction stride
// *
// * @param[in] out_strd
// *  Output Stride
// *
// * @param[in] pi2_tmp
// *  temporary buffer of size 1*16
// *
// * @param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix
// *
// * @returns  Void
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
//                                      UWORD8 *pu1_pred,
//                                      UWORD8 *pu1_out,
//                                      WORD32 pred_strd,
//                                      WORD32 out_strd,
//                                      const UWORD16 *pu2_iscal_mat,
//                                      const UWORD16 *pu2_weigh_mat,
//                                      UWORD32 u4_qp_div_6,
//                                      WORD32 *pi4_tmp,
//                                      WORD32 iq_start_idx,
//                                      WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0        => *pi2_src
//x1        => *pu1_pred
//x2        => *pu1_out
//w3        => pred_strd   (32-bit; sign-extended into x3 on entry)
//w4        => out_strd    (32-bit; sign-extended into x4 on entry)
//x5        => *pu2_iscal_mat
//x6        => *pu2_weigh_mat
//w7        => u4_qp_div_6
//[sp, #0]  => pi4_tmp     (first stack slot by argument order; never read here)
//[sp, #8]  => iq_start_idx
//[sp, #16] => pi2_dc_ld_addr
//
// NEON registers written: v0, v1, v2, v30.
// NOTE(review): none of those are callee-saved under AAPCS64, so the
// push_v_regs/pop_v_regs pair looks redundant in this function - confirm
// against the macro definitions in ih264_neon_macros.s before removing it.

.text
.p2align 2

    .global ih264_iquant_itrans_recon_4x4_dc_av8
ih264_iquant_itrans_recon_4x4_dc_av8:

    // The strides are 32-bit ints; the upper halves of x3/x4 are not
    // guaranteed by the caller, so widen them before they are used as
    // 64-bit post-index increments. sxtw leaves the flags untouched.
    sxtw      x3, w3
    sxtw      x4, w4

    // Stack arguments are fetched before push_v_regs (presumably because the
    // macro moves sp - confirm against ih264_neon_macros.s).
    ldr       w8, [sp, #8]              // w8 = iq_start_idx
    subs      w8, w8, #1                // Z set iff iq_start_idx == 1; flags must survive until the bne/beq below

    ldr       x10, [sp, #16]            // x10 = pi2_dc_ld_addr
    push_v_regs
    dup       v30.4s, w7                // broadcast u4_qp_div_6 as the per-lane shift count


    // iq_start_idx == 1: the DC value is taken as-is from pi2_dc_ld_addr
    bne       donot_use_pi2_dc_ld_addr_luma_dc
    ld1       {v0.h}[0], [x10]
donot_use_pi2_dc_ld_addr_luma_dc:

    // iq_start_idx != 1: inverse-quantise pi2_src[0]. Only lane 0 carries
    // meaningful data; it is the only lane broadcast afterwards.
    beq       donot_use_pi2_src_luma_dc
    ld1       {v0.h}[0], [x5]           // pu2_iscal_mat[0]
    ld1       {v1.h}[0], [x6]           // pu2_weigh_mat[0]
    ld1       {v2.h}[0], [x0]           // pi2_src[0]
    mul       v0.4h, v1.4h, v0.4h       // weigh * iscal (low 16 bits)
    smull     v0.4s, v0.4h, v2.4h       // * src, widened to 32 bits
    sshl      v0.4s, v0.4s, v30.4s      // << u4_qp_div_6
    sqrshrn   v0.4h, v0.4s, #4          // rounding >> 4, saturating narrow to 16 bits
donot_use_pi2_src_luma_dc:


    dup       v0.8h, v0.h[0]            // replicate the DC term across all 8 lanes
    srshr     v0.8h, v0.8h, #6          // rounding shift: (dc + 32) >> 6

    // Load four prediction rows of 4 bytes each, two rows per register
    ld1       {v1.s}[0], [x1], x3
    ld1       {v1.s}[1], [x1], x3
    ld1       {v2.s}[0], [x1], x3
    ld1       {v2.s}[1], [x1]

    uxtl      v1.8h, v1.8b              // widen prediction to 16 bits
    uxtl      v2.8h, v2.8b

    add       v1.8h, v0.8h, v1.8h       // pred + dc
    add       v2.8h, v0.8h, v2.8h

    sqxtun    v1.8b, v1.8h              // saturate to unsigned 8 bits
    sqxtun    v2.8b, v2.8h

    // Store four reconstructed rows of 4 bytes each
    st1       {v1.s}[0], [x2], x4
    st1       {v1.s}[1], [x2], x4
    st1       {v2.s}[0], [x2], x4
    st1       {v2.s}[1], [x2]
    pop_v_regs
    ret

// /*
// ********************************************************************************
// *
// * @brief This function reconstructs a 4x4 sub block from quantized residue and
// * prediction buffer if only dc value is present for residue
// *
// * @par Description:
// *  The quantized residue is first inverse quantized,
// *  This inverse quantized content is added to the prediction buffer to recon-
// *  struct the end output
// *
// * @param[in] pi2_src
// *  quantized dc coefficient
// *
// * @param[in] pu1_pred
// *  prediction 4x4 block in interleaved format
// *
// * @param[in] pred_strd,
// *  Prediction buffer stride in interleaved format
// *
// * @param[in] out_strd
// *  recon buffer Stride
// *
// * @returns none
// *
// * @remarks none
// *
// *******************************************************************************
// */
// void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
//                                              UWORD8 *pu1_pred,
//                                              UWORD8 *pu1_out,
//                                              WORD32 pred_strd,
//                                              WORD32 out_strd,
//                                              const UWORD16 *pu2_iscal_mat,
//                                              const UWORD16 *pu2_weigh_mat,
//                                              UWORD32 u4_qp_div_6,
//                                              WORD16 *pi2_tmp,
//                                              WORD16 *pi2_dc_src)
// Register Usage
// x0        : pi2_src       (not read; x0 is reused as a scratch pointer)
// x1        : pu1_pred
// x2        : pu1_out
// w3        : pred_strd     (32-bit; sign-extended into x3 on entry)
// w4        : out_strd      (32-bit; sign-extended into x4 on entry)
// x5        : pu2_iscal_mat (not read)
// x6        : pu2_weigh_mat (not read)
// x7        : u4_qp_div_6   (not read)
// [sp, #0]  : pi2_tmp       (first stack slot by argument order; never read here)
// [sp, #8]  : pi2_dc_src
// Neon registers v0, v1, v2, v11, v12 and v31 are written.
// v11/v12 fall in the AAPCS64 callee-saved range (v8-v15), which is presumably
// what push_v_regs/pop_v_regs preserve - confirm against ih264_neon_macros.s.


    .global ih264_iquant_itrans_recon_chroma_4x4_dc_av8
ih264_iquant_itrans_recon_chroma_4x4_dc_av8:

    // The strides are 32-bit ints; the upper halves of x3/x4 are not
    // guaranteed by the caller, so widen them before they are used as
    // 64-bit post-index increments.
    sxtw      x3, w3
    sxtw      x4, w4

    ldr       x0, [sp, #8]              // x0 = pi2_dc_src (read before push_v_regs touches the stack)
    push_v_regs
    ld1       {v0.h}[0], [x0]           // dc value
    dup       v0.8h, v0.h[0]            // replicate across all 8 lanes
    srshr     v0.8h, v0.8h, #6          // rounding shift: (dc + 32) >> 6


    //backup pu1_out: x2 walks the rows for the loads, x0 for the stores
    mov       x0, x2

    movi      v31.8h, #0x00ff           //mask for interleaving [copy lower 8 bits]

    // Four prediction rows of 8 interleaved bytes each, two rows per register
    ld1       {v1.d}[0], [x1], x3
    ld1       {v1.d}[1], [x1], x3
    ld1       {v2.d}[0], [x1], x3
    ld1       {v2.d}[1], [x1], x3

    ld1       {v11.d}[0], [x2], x4      //load pu1_out for interleaving
    ld1       {v11.d}[1], [x2], x4
    ld1       {v12.d}[0], [x2], x4
    ld1       {v12.d}[1], [x2]

    // Keep the even-indexed bytes (this plane's samples). Only the low 64 bits
    // of the result are consumed below, and those come solely from the first
    // source operand, so the register itself serves as the second operand
    // instead of an uninitialised scratch register.
    uzp1      v1.16b, v1.16b, v1.16b
    uzp1      v2.16b, v2.16b, v2.16b

    uaddw     v1.8h, v0.8h, v1.8b       // dc + pred (widened to 16 bits)
    uaddw     v2.8h, v0.8h, v2.8b

    sqxtun    v1.8b, v1.8h              // saturate to unsigned 8 bits
    sqxtun    v2.8b, v2.8h

    uxtl      v1.8h, v1.8b              // back to one sample per 16-bit lane, high byte zero
    uxtl      v2.8h, v2.8b

    // Insert the new samples into the low byte of every 16-bit pair of the
    // existing output, leaving the other byte of each pair untouched.
    bit       v11.16b, v1.16b, v31.16b
    bit       v12.16b, v2.16b, v31.16b

    st1       {v11.d}[0], [x0], x4
    st1       {v11.d}[1], [x0], x4
    st1       {v12.d}[0], [x0], x4
    st1       {v12.d}[1], [x0]
    pop_v_regs
    ret

///*
// *******************************************************************************
// *
// * //brief
// *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
// *   [Only for Dc coeff]
// * //par Description:
// *  Performs inverse transform Ci8 and adds the residue to get the
// *  reconstructed block
// *
// * //param[in] pi2_src
// *  Input 8x8 coefficients
// *
// * //param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * //param[out] pu1_out
// *  Output 8x8 block
// *
// * //param[in] u4_qp_div_6
// *  QP
// *
// * //param[in] pu2_weigh_mat
// *  Pointer to weight matrix
// *
// * //param[in] pred_strd,
// *  Prediction stride
// *
// * //param[in] out_strd
// *  Output Stride
// *
// * //param[in] pi2_tmp
// *  temporary buffer of size 1*64
// *
// * //param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix
// *
// * //returns  Void
// *
// * //remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_dc_8x8(WORD16 *pi2_src,
//                                      UWORD8 *pu1_pred,
//                                      UWORD8 *pu1_out,
//                                      WORD32 pred_strd,
//                                      WORD32 out_strd,
//                                      const UWORD16 *pu2_iscal_mat,
//                                      const UWORD16 *pu2_weigh_mat,
//                                      UWORD32 u4_qp_div_6,
//                                      WORD32 *pi4_tmp,
//                                      WORD32 iq_start_idx,
//                                      WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0        => *pi2_src
//x1        => *pu1_pred
//x2        => *pu1_out
//w3        => pred_strd   (32-bit; sign-extended into x3 on entry)
//w4        => out_strd    (32-bit; sign-extended into x4 on entry)
//x5        => *pu2_iscal_mat
//x6        => *pu2_weigh_mat
//w7        => u4_qp_div_6
//NOT USED  => pi4_tmp
//NOT USED  => iq_start_idx
//NOT USED  => pi2_dc_ld_addr
//
// NEON registers written: v0-v3, v8-v12, v22-v29.
// v8-v12 fall in the AAPCS64 callee-saved range (v8-v15), which is presumably
// what push_v_regs/pop_v_regs preserve - confirm against ih264_neon_macros.s.

    .global ih264_iquant_itrans_recon_8x8_dc_av8
ih264_iquant_itrans_recon_8x8_dc_av8:

    // The strides are 32-bit ints; the upper halves of x3/x4 are not
    // guaranteed by the caller, so widen them before they are used as
    // 64-bit post-index increments.
    sxtw      x3, w3
    sxtw      x4, w4

    push_v_regs

    ld1       {v1.h}[0], [x5]           // pu2_iscal_mat[0]
    ld1       {v2.h}[0], [x6]           // pu2_weigh_mat[0]
    ld1       {v0.h}[0], [x0]           // pi2_src[0]
    dup       v3.4s, w7                 // broadcast u4_qp_div_6 as the per-lane shift count


    // Only lane 0 carries meaningful data through the next five instructions;
    // it is the lane broadcast by the final dup.
    mul       v1.8h, v1.8h, v2.8h       // iscal * weigh (low 16 bits)
    smull     v0.4s, v0.4h, v1.4h       // * src, widened to 32 bits
    sshl      v0.4s, v0.4s, v3.4s       // << u4_qp_div_6

    sqrshrn   v0.4h, v0.4s, #6          // rounding >> 6, saturating narrow to 16 bits
    srshr     v0.8h, v0.8h, #6          // rounding shift: (dc + 32) >> 6
    dup       v0.8h, v0.h[0]            // replicate the DC term across all 8 lanes

    // Load eight prediction rows of 8 bytes each
    ld1       {v22.8b}, [x1], x3
    ld1       {v23.8b}, [x1], x3
    ld1       {v24.8b}, [x1], x3
    ld1       {v25.8b}, [x1], x3
    ld1       {v26.8b}, [x1], x3
    ld1       {v27.8b}, [x1], x3
    ld1       {v28.8b}, [x1], x3
    ld1       {v29.8b}, [x1]

    // dc + pred, with the prediction widened to 16 bits
    uaddw     v1.8h, v0.8h, v22.8b
    uaddw     v2.8h, v0.8h, v23.8b
    uaddw     v3.8h, v0.8h, v24.8b
    uaddw     v8.8h, v0.8h, v25.8b
    uaddw     v9.8h, v0.8h, v26.8b
    uaddw     v10.8h, v0.8h, v27.8b
    uaddw     v11.8h, v0.8h, v28.8b
    uaddw     v12.8h, v0.8h, v29.8b

    // Saturate each row to unsigned 8 bits
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h
    sqxtun    v3.8b, v3.8h
    sqxtun    v8.8b, v8.8h
    sqxtun    v9.8b, v9.8h
    sqxtun    v10.8b, v10.8h
    sqxtun    v11.8b, v11.8h
    sqxtun    v12.8b, v12.8h

    // Store eight reconstructed rows of 8 bytes each
    st1       {v1.8b}, [x2], x4
    st1       {v2.8b}, [x2], x4
    st1       {v3.8b}, [x2], x4
    st1       {v8.8b}, [x2], x4
    st1       {v9.8b}, [x2], x4
    st1       {v10.8b}, [x2], x4
    st1       {v11.8b}, [x2], x4
    st1       {v12.8b}, [x2]

    pop_v_regs
    ret