@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@ *******************************************************************************
@ * @file
@ *  ih264_iquant_itrans_recon_dc_a9.s
@ *
@ * @brief
@ *  Contains function definitions for single stage inverse transform
@ *
@ * @author
@ *  Mohit
@ *
@ * @par List of Functions:
@ *  - ih264_iquant_itrans_recon_4x4_dc_a9()
@ *  - ih264_iquant_itrans_recon_8x8_dc_a9()
@ *  - ih264_iquant_itrans_recon_chroma_4x4_dc_a9()
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@*
@**
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
@ *  for dc input pattern only, i.e. only the (0,0) element of the input 4x4 block is
@ *  non-zero.
@ *  For complete function, refer ih264_iquant_itrans_recon_a9.s
@ *
@ * @par Description:
@ *  Dequantises the DC coefficient, derives the DC output of the inverse
@ *  transform Ci4 as (q0 + 32) >> 6 and adds that residue to the 4x4
@ *  prediction block to get the reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 4x4 coefficients; only pi2_src[0] is read
@ *
@ * @param[in] pu1_pred
@ *  Prediction 4x4 block
@ *
@ * @param[out] pu1_out
@ *  Output 4x4 block
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix; only element 0 is read
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix; only element 0 is read
@ *
@ * @param[in] u4_qp_div_6
@ *  QP; used as the left-shift amount applied to the dequantised product
@ *
@ * @param[in] pi4_tmp
@ *  Temporary buffer of size 1*16; not accessed by this routine
@ *
@ * @param[in] iq_start_idx
@ *  When equal to 1 the dequantised value is discarded and pi2_src[0] is
@ *  used directly as q0
@ *
@ * @param[in] pi2_dc_ld_addr
@ *  Not accessed by this routine
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_4x4_dc(WORD16 *pi2_src,
@                                      UWORD8 *pu1_pred,
@                                      UWORD8 *pu1_out,
@                                      WORD32 pred_strd,
@                                      WORD32 out_strd,
@                                      const UWORD16 *pu2_iscal_mat,
@                                      const UWORD16 *pu2_weigh_mat,
@                                      UWORD32 u4_qp_div_6,
@                                      WORD32 *pi4_tmp,
@                                      WORD32 iq_start_idx,
@                                      WORD16 *pi2_dc_ld_addr)
@**************Variables Vs Registers*****************************************
@r0       => *pi2_src
@r1       => *pu1_pred
@r2       => *pu1_out
@r3       => pred_strd
@r4       => out_strd
@r5       => *pu2_iscal_mat, then pu2_iscal_mat[0]
@r6       => *pu2_weigh_mat, then pu2_weigh_mat[0], then q0 / i_macro
@r7       => u4_qp_div_6
@r8       => pi2_src[0]
@r9       => iq_start_idx
@unused   => pi4_tmp, pi2_dc_ld_addr

.text
.syntax unified
.p2align 2

    .global ih264_iquant_itrans_recon_4x4_dc_a9

ih264_iquant_itrans_recon_4x4_dc_a9:

@ Dequantisation is done with a single left shift by u4_qp_div_6 followed by
@ a fixed rounding add of 8 and an arithmetic right shift by 4:
@   q0 = (((pi2_src[0] * iscal[0] * weigh[0]) << u4_qp_div_6) + 8) >> 4

    @ r10 is not used below; it stays in the save list so the frame remains
    @ 8 words (8-byte aligned) and the stack-argument offsets are unchanged.
    stmfd       sp!, {r4-r10, r14}      @ save 8 words; stack args now start at [sp, #32]
    ldr         r5, [sp, #36]           @ r5 = pu2_iscal_mat
    ldr         r6, [sp, #40]           @ r6 = pu2_weigh_mat
    ldrsh       r8, [r0]                @ r8 = pi2_src[0] (signed halfword)
    ldrh        r6, [r6]                @ r6 = pu2_weigh_mat[0] (unsigned halfword)
    ldrh        r5, [r5]                @ r5 = pu2_iscal_mat[0] (unsigned halfword)
@=======================DEQUANT FROM HERE===================================
    mul         r6, r6, r5              @ pu2_iscal_mat[0] * pu2_weigh_mat[0]
    ldr         r7, [sp, #44]           @ r7 = u4_qp_div_6
    mul         r6, r6, r8              @ pi2_src[0] * pu2_iscal_mat[0] * pu2_weigh_mat[0]
    ldr         r4, [sp, #32]           @ r4 = out_strd
    ldr         r9, [sp, #52]           @ r9 = iq_start_idx

    lsl         r6, r6, r7              @ product << u4_qp_div_6
    add         r6, r6, #8              @ + rounding constant
    asr         r6, r6, #4              @ q0 = (... + 8) >> 4

    cmp         r9, #1                  @ iq_start_idx == 1 => intra case
    moveq       r6, r8                  @ intra: q0 = pi2_src[0] (still live in r8, no reload needed)

    add         r6, r6, #32             @ i_macro = q0 + 32
    asr         r6, r6, #6              @ i_macro >>= 6 : DC output of 2-stage transform
    vdup.s16    q0, r6                  @ broadcast residue to all 8 lanes of q0

    vld1.32     {d30[0]}, [r1], r3      @ row 0 of pu1_pred (4 bytes)
    vld1.32     {d30[1]}, [r1], r3      @ row 1 of pu1_pred
    vld1.32     {d31[0]}, [r1], r3      @ row 2 of pu1_pred
    vld1.32     {d31[1]}, [r1], r3      @ row 3 of pu1_pred

    vaddw.u8    q10, q0, d30            @ rows 0-1: residue + widened prediction
    vaddw.u8    q11, q0, d31            @ rows 2-3: residue + widened prediction

    vqmovun.s16 d0, q10                 @ saturate rows 0-1 to unsigned 8 bit
    vqmovun.s16 d1, q11                 @ saturate rows 2-3 to unsigned 8 bit

    vst1.32     {d0[0]}, [r2], r4       @ store row 0
    vst1.32     {d0[1]}, [r2], r4       @ store row 1
    vst1.32     {d1[0]}, [r2], r4       @ store row 2
    vst1.32     {d1[1]}, [r2]           @ store row 3

    ldmfd       sp!, {r4-r10, r15}      @ restore registers and return (pc <- saved lr)




@*
@ *******************************************************************************
@ *
@ * @brief
@ *  This function performs inverse quant and Inverse transform type Ci4 for 8*8 block
@ *  for dc input pattern only, i.e. only the (0,0) element of the input 8x8 block is
@ *  non-zero.
@ *  For complete function, refer ih264_iquant_itrans_recon_a9.s
@ *
@ * @par Description:
@ *  Performs inverse transform Ci8 and adds the residue to get the
@ *  reconstructed block
@ *
@ * @param[in] pi2_src
@ *  Input 8x8 coefficients; only pi2_src[0] is read
@ *
@ * @param[in] pu1_pred
@ *  Prediction 8x8 block
@ *
@ * @param[out] pu1_out
@ *  Output 8x8 block
@ *
@ * @param[in] pred_strd
@ *  Prediction stride
@ *
@ * @param[in] out_strd
@ *  Output stride
@ *
@ * @param[in] pu2_iscal_mat
@ *  Pointer to the inverse quantization matrix; only element 0 is read
@ *
@ * @param[in] pu2_weigh_mat
@ *  Pointer to weight matrix; only element 0 is read
@ *
@ * @param[in] u4_qp_div_6
@ *  QP; used as the left-shift amount applied to the dequantised product
@ *
@ * @param[in] pi4_tmp
@ *  Temporary buffer of size 1*64; not accessed by this routine
@ *
@ * @param[in] iq_start_idx
@ *  Not accessed by this routine
@ *
@ * @returns  Void
@ *
@ * @remarks
@ *  None
@ *
@ *******************************************************************************
@ *
@void ih264_iquant_itrans_recon_8x8_dc(WORD16 *pi2_src,
@                                      UWORD8 *pu1_pred,
@                                      UWORD8 *pu1_out,
@                                      WORD32 pred_strd,
@                                      WORD32 out_strd,
@                                      const UWORD16 *pu2_iscal_mat,
@                                      const UWORD16 *pu2_weigh_mat,
@                                      UWORD32 u4_qp_div_6,
@                                      WORD32 *pi4_tmp,
@                                      WORD32 iq_start_idx)
@**************Variables Vs Registers*****************************************
@r0       => *pi2_src
@r1       => *pu1_pred
@r2       => *pu1_out
@r3       => pred_strd
@r4       => out_strd
@r5       => *pu2_iscal_mat, then pu2_iscal_mat[0]
@r6       => *pu2_weigh_mat, then pu2_weigh_mat[0], then q0 / i_macro
@r7       => u4_qp_div_6
@r8       => pi2_src[0]
@ NEON: q0-q3 and q8-q15 only; d8-d15 are not touched, so no vpush/vpop is
@ required.


    .global ih264_iquant_itrans_recon_8x8_dc_a9
ih264_iquant_itrans_recon_8x8_dc_a9:

    stmfd       sp!, {r4-r8, r14}       @ save 6 words; stack args now start at [sp, #24]
    ldr         r5, [sp, #28]           @ r5 = pu2_iscal_mat
    ldr         r6, [sp, #32]           @ r6 = pu2_weigh_mat
    ldrsh       r8, [r0]                @ r8 = pi2_src[0] (signed halfword)
    ldrh        r6, [r6]                @ r6 = pu2_weigh_mat[0] (unsigned halfword)
    ldrh        r5, [r5]                @ r5 = pu2_iscal_mat[0] (unsigned halfword)
@=======================DEQUANT FROM HERE===================================
    mul         r6, r6, r5              @ pu2_iscal_mat[0] * pu2_weigh_mat[0]
    ldr         r7, [sp, #36]           @ r7 = u4_qp_div_6
    mul         r6, r6, r8              @ pi2_src[0] * pu2_iscal_mat[0] * pu2_weigh_mat[0]
    ldr         r4, [sp, #24]           @ r4 = out_strd

    lsl         r6, r6, r7              @ product << u4_qp_div_6
    add         r6, r6, #32             @ + rounding constant
    asr         r6, r6, #6              @ q0 = (... + 32) >> 6
    add         r6, r6, #32             @ i_macro = q0 + 32
    asr         r6, r6, #6              @ i_macro >>= 6 : DC output of 2-stage transform
    vdup.s16    q8, r6                  @ broadcast residue to all 8 lanes of q8

    @ Load the eight 8-byte prediction rows into d24-d31 and, interleaved with
    @ the loads, add the residue to each widened row.
    vld1.32     {d24}, [r1], r3         @ pred row 0
    vld1.32     {d25}, [r1], r3         @ pred row 1
    vld1.32     {d26}, [r1], r3         @ pred row 2
    vaddw.u8    q0, q8, d24             @ row 0 + residue
    vld1.32     {d27}, [r1], r3         @ pred row 3
    vaddw.u8    q1, q8, d25             @ row 1 + residue
    vld1.32     {d28}, [r1], r3         @ pred row 4
    vaddw.u8    q2, q8, d26             @ row 2 + residue
    vld1.32     {d29}, [r1], r3         @ pred row 5
    vaddw.u8    q3, q8, d27             @ row 3 + residue
    vld1.32     {d30}, [r1], r3         @ pred row 6
    vaddw.u8    q9, q8, d28             @ row 4 + residue
    vld1.32     {d31}, [r1], r3         @ pred row 7

    vaddw.u8    q10, q8, d29            @ row 5 + residue
    vaddw.u8    q11, q8, d30            @ row 6 + residue
    vaddw.u8    q8, q8, d31             @ row 7 + residue (residue vector no longer needed)

    @ Saturate each row to unsigned 8 bit. The destination dN overlaps an
    @ earlier 16-bit sum (d1 in q0, d2/d3 in q1, ...), so the narrowing must
    @ proceed in ascending row order: every sum is consumed before its
    @ register is overwritten.
    vqmovun.s16 d0, q0                  @ row 0
    vqmovun.s16 d1, q1                  @ row 1
    vqmovun.s16 d2, q2                  @ row 2
    vqmovun.s16 d3, q3                  @ row 3
    vqmovun.s16 d4, q9                  @ row 4
    vqmovun.s16 d5, q10                 @ row 5
    vst1.32     {d0}, [r2], r4          @ store row 0
    vqmovun.s16 d6, q11                 @ row 6
    vst1.32     {d1}, [r2], r4          @ store row 1
    vqmovun.s16 d7, q8                  @ row 7
    vst1.32     {d2}, [r2], r4          @ store row 2
    vst1.32     {d3}, [r2], r4          @ store row 3
    vst1.32     {d4}, [r2], r4          @ store row 4
    vst1.32     {d5}, [r2], r4          @ store row 5
    vst1.32     {d6}, [r2], r4          @ store row 6
    vst1.32     {d7}, [r2], r4          @ store row 7

    ldmfd       sp!, {r4-r8, r15}       @ restore registers and return (pc <- saved lr)


@ *
@ ********************************************************************************
@ *
@ * @brief This function reconstructs a 4x4 sub block from quantized residue and
@ *        prediction buffer if only dc value is present for residue
@ *
@ * @par Description:
@ *  The quantized residue is first inverse quantized,
@ *  This inverse quantized content is added to the prediction buffer to recon-
@ *  struct the end output
@ *
@ * @param[in] pi2_src
@ *  quantized dc coefficient
@ *
@ * @param[in] pu1_pred
@ *  prediction 4x4 block in interleaved format
@ *
@ * @param[in] pred_strd,
@ *  Prediction buffer stride in interleaved format
@ *
@ * @param[in] out_strd
@ *  recon buffer Stride
@ *
@ * @returns none
@ *
@ * @remarks none
@ *
@ *******************************************************************************
@ *
@ void ih264_iquant_itrans_recon_chroma_4x4_dc(WORD16 *pi2_src,
@                                             UWORD8 *pu1_pred,
@                                             UWORD8 *pu1_out,
@                                             WORD32 pred_strd,
@                                             WORD32 out_strd,
@                                             const UWORD16 *pu2_iscal_mat,
@                                             const UWORD16 *pu2_weigh_mat,
@                                             UWORD32 u4_qp_div_6,
@                                             WORD16 *pi2_tmp,
@                                             WORD16 *pi2_dc_src)
@ Register Usage
@ r0 : pi2_src on entry (never read); reused for pi2_dc_src, then out_strd
@ r1 : pu1_pred; reused as the store pointer into pu1_out
@ r2 : pu1_out (load pointer)
@ r3 : pred_strd
@ Stack arguments read: [sp] = out_strd, [sp, #20] = pi2_dc_src
@ Neon registers d0-d6, d18-d21 and d30-d31 are used
@ No ARM or NEON register is pushed: only r0-r3 and NEON registers outside
@ d8-d15 are written
@
@ Only the even bytes of each interleaved row (the plane addressed by
@ pu1_pred / pu1_out) are reconstructed; the odd bytes of pu1_out are read and
@ written back unchanged. Each prediction row load reads 16 bytes and each
@ output row access covers 8 bytes.
    .global ih264_iquant_itrans_recon_chroma_4x4_dc_a9
ih264_iquant_itrans_recon_chroma_4x4_dc_a9:

    ldr         r0, [sp, #20]           @ r0 = pi2_dc_src
    vld1.16     {d0[]}, [r0]            @ load only pi2_dc_src[0] (2 bytes), replicated to all lanes of d0

    ldr         r0, [sp]                @ r0 = out_strd

    @ vld2 de-interleaves each row: first register gets the even bytes, second
    @ the odd bytes. Each following load deliberately overwrites the previous
    @ row's odd-byte register, leaving even bytes of rows 0..3 in d2..d5.
    vld2.s8     {d2, d3}, [r1], r3      @ row 0: even bytes -> d2, odd bytes -> d3
    vld2.s8     {d3, d4}, [r1], r3      @ row 1: even bytes -> d3
    vrshr.s16   d0, d0, #6              @ i_macro = ((q0 + 32) >> 6)
    vld2.s8     {d4, d5}, [r1], r3      @ row 2: even bytes -> d4
    vld2.s8     {d5, d6}, [r1], r3      @ row 3: even bytes -> d5

    vdup.s16    q0, d0[0]               @ broadcast i_macro to all 8 lanes of q0
    mov         r1, r2                  @ keep pu1_out for the stores; r2 advances during the loads

    vtrn.32     d2, d3                  @ d2 = first 4 pixels of row 0 | first 4 pixels of row 1
    vtrn.32     d4, d5                  @ d4 = first 4 pixels of row 2 | first 4 pixels of row 3

    vmov.u16    q15, #0x00ff            @ mask selecting the low byte of every halfword


    vld1.u8     {d18}, [r2], r0         @ current out row 0 (8 interleaved bytes)
    vaddw.u8    q1, q0, d2              @ rows 0-1: residue + widened prediction
    vld1.u8     {d19}, [r2], r0         @ current out row 1
    vaddw.u8    q2, q0, d4              @ rows 2-3: residue + widened prediction
    vld1.u8     {d20}, [r2], r0         @ current out row 2
    vld1.u8     {d21}, [r2], r0         @ current out row 3

    vqmovun.s16 d2, q1                  @ saturate rows 0-1 to unsigned 8 bit
    vqmovun.s16 d4, q2                  @ saturate rows 2-3 to unsigned 8 bit

    vmovl.u8    q1, d2                  @ widen again: pixel in low byte, zero in high byte
    vmovl.u8    q2, d4

    vbit.u8     q9, q1, q15             @ insert new pixels into the even bytes, keep odd bytes of out
    vbit.u8     q10, q2, q15

    vst1.u8     {d18}, [r1], r0         @ store out row 0
    vst1.u8     {d19}, [r1], r0         @ store out row 1
    vst1.u8     {d20}, [r1], r0         @ store out row 2
    vst1.u8     {d21}, [r1], r0         @ store out row 3

    bx          lr