//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
///*******************************************************************************
// * //file
// *  ih264_iquant_itrans_recon_a9.s
// *
// * //brief
// *  Contains function definitions for single stage inverse transform
// *
// * //author
// *  Parthiban V
// *  Mohit
// *  Harinarayanaan
// *
// *  //par List of Functions:
// *  - ih264_iquant_itrans_recon_4x4_av8()
// *  - ih264_iquant_itrans_recon_8x8_av8()
// *  - ih264_iquant_itrans_recon_chroma_4x4_av8()
// *
// * //remarks
// *  Syntax : GNU as, AArch64 (A64) with Advanced SIMD; "//" starts a comment.
// *  ABI    : the first eight arguments arrive in x0-x7 and the rest on the
// *           stack (presumably AAPCS64 - confirm against the build target).
// *
// *******************************************************************************

.text
.p2align 2
// Supplies the push_v_regs / pop_v_regs macros used by every routine below.
.include "ih264_neon_macros.s"

///*
// *******************************************************************************
// *
// * //brief
// *  This function performs inverse quant and Inverse transform type Ci4 for 4*4 block
// *
// * //par Description:
// *  Performs inverse transform Ci4 and adds the residue to get the
// *  reconstructed block
// *
// * //param[in] pi2_src
// *  Input 4x4 coefficients
// *
// *
// * //param[in] pu1_pred
// *  Prediction 4x4 block
// *
// * //param[out] pu1_out
// *  Output 4x4 block
// *
// * //param[in] u4_qp_div_6
// *  QP / 6; used as the per-lane left-shift count during dequantisation
// *
// * //param[in] pu2_weigh_mat
// *  Pointer to weight matrix
// *
// * //param[in] pred_strd,
// *  Prediction stride
// *
// * //param[in] out_strd
// *  Output Stride
// *
// * //param[in] pi4_tmp
// *  temporary buffer of size 1*16 (never accessed by this implementation)
// *
// * //param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix
// *
// * //param[in] iq_start_idx
// *  When equal to 1, coefficient 0 is replaced by *pi2_dc_ld_addr after
// *  dequantisation (the original source notes this as the intra case)
// *
// * //param[in] pi2_dc_ld_addr
// *  Address of the alternate DC coefficient; read only when iq_start_idx == 1
// *
// * //returns  Void
// *
// * //remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
//                                   UWORD8 *pu1_pred,
//                                   UWORD8 *pu1_out,
//                                   WORD32 pred_strd,
//                                   WORD32 out_strd,
//                                   const UWORD16 *pu2_iscal_mat,
//                                   const UWORD16 *pu2_weigh_mat,
//                                   UWORD32 u4_qp_div_6,
//                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx,
//                                   WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0            => *pi2_src
//x1            => *pu1_pred
//x2            => *pu1_out
//x3            => pred_strd
//x4            => out_strd
//x5            => *pu2_iscal_mat
//x6            => *pu2_weigh_mat
//x7            => u4_qp_div_6
//[sp, #0]      => pi4_tmp          (entry offset; unused)
//[sp, #8]      => iq_start_idx     (entry offset; +64 after push_v_regs => #72)
//[sp, #16]     => pi2_dc_ld_addr   (entry offset; +64 after push_v_regs => #80)
//
//Clobbers: x1, x2, x8, x10, NZCV, v0-v7, v16-v23, v26-v31.
//v8-v15 are also written; push_v_regs/pop_v_regs (from ih264_neon_macros.s)
//are presumed to save/restore them - confirm against the macro definition.
//
//Only one shift is done in horizontal inverse because,
//if u4_qp_div_6 is lesser than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value
//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0

    .global ih264_iquant_itrans_recon_4x4_av8
ih264_iquant_itrans_recon_4x4_av8:

    push_v_regs                                 // moves sp down by 64, hence the +64 on the stack offsets below

    dup       v30.4s, w7                        // v30 = u4_qp_div_6 in every 32-bit lane

    ldr       w8, [sp, #72]                     // iq_start_idx
    ldr       x10, [sp, #80]                    // pi2_dc_ld_addr (alternate DC address)

    // Z is set when iq_start_idx == 1. It is consumed by the bne after the
    // dequant; none of the SIMD instructions in between write NZCV.
    cmp       w8, #1

//=======================DEQUANT FROM HERE===================================

    // ld4 de-interleaves with a stride of four halfwords, so register k of each
    // group holds elements k, k+4, k+8, k+12 of the 16-entry array (one lane per
    // group of four consecutive elements).
    ld4       {v20.4h - v23.4h}, [x5]           // pu2_iscal_mat[0..15]
    ld4       {v26.4h - v29.4h}, [x6]           // pu2_weigh_mat[0..15]
    ld4       {v16.4h - v19.4h}, [x0]           // pi2_src[0..15]

    mul       v20.4h, v20.4h, v26.4h            // x[i] = iscal[i] * weigh[i]   (16-bit product)
    mul       v21.4h, v21.4h, v27.4h
    mul       v22.4h, v22.4h, v28.4h
    mul       v23.4h, v23.4h, v29.4h

    smull     v0.4s, v16.4h, v20.4h             // p[i] = src[i] * x[i]         (widened to 32 bits)
    smull     v2.4s, v17.4h, v21.4h
    smull     v4.4s, v18.4h, v22.4h
    smull     v6.4s, v19.4h, v23.4h

    sshl      v0.4s, v0.4s, v30.4s              // q[i] = p[i] << (qp / 6)
    sshl      v2.4s, v2.4s, v30.4s
    sshl      v4.4s, v4.4s, v30.4s
    sshl      v6.4s, v6.4s, v30.4s

    sqrshrn   v0.4h, v0.4s, #0x4                // d0 = sat16((q[i] + 8) >> 4)
    sqrshrn   v1.4h, v2.4s, #0x4                // d1
    sqrshrn   v2.4h, v4.4s, #0x4                // d2
    sqrshrn   v3.4h, v6.4s, #0x4                // d3

    bne       skip_loading_luma_dc_src          // iq_start_idx != 1: keep the dequantised DC
    ld1       {v0.h}[0], [x10]                  // coefficient 0 = *pi2_dc_ld_addr
skip_loading_luma_dc_src:

    //========= PROCESS IDCT FROM HERE =======
    //Steps for Stage 1:
    //------------------
    // v0..v3 are d0..d3; each lane handles one group of four coefficients.
    // Prediction rows are loaded in between to overlap with the arithmetic.
    ld1       {v30.s}[0], [x1], x3              // i row load pu1_pred buffer (v30 no longer needed as shift count)

    sshr      v8.4h, v1.4h, #1                  // d1 >> 1
    sshr      v9.4h, v3.4h, #1                  // d3 >> 1

    add       v4.4h, v0.4h, v2.4h               // x0 = d0 + d2
    sub       v5.4h, v0.4h, v2.4h               // x1 = d0 - d2
    sub       v6.4h, v8.4h, v3.4h               // x2 = (d1 >> 1) - d3
    add       v7.4h, v1.4h, v9.4h               // x3 = d1 + (d3 >> 1)

    ld1       {v30.s}[1], [x1], x3              // ii row load pu1_pred buffer

    add       v10.4h, v4.4h, v7.4h              // x0 + x3
    add       v11.4h, v5.4h, v6.4h              // x1 + x2
    sub       v12.4h, v5.4h, v6.4h              // x1 - x2
    sub       v13.4h, v4.4h, v7.4h              // x0 - x3

    ld1       {v31.s}[0], [x1], x3              // iii row load pu1_pred buffer

    //Steps for Stage 2:
    //transpose the 4x4 of halfwords held in v10..v13
    trn1      v4.4h, v10.4h, v11.4h
    trn2      v5.4h, v10.4h, v11.4h
    trn1      v6.4h, v12.4h, v13.4h
    trn2      v7.4h, v12.4h, v13.4h

    trn1      v10.2s, v4.2s, v6.2s              // q0 = lane 0 of the stage-1 outputs
    trn1      v11.2s, v5.2s, v7.2s              // q1 = lane 1
    trn2      v12.2s, v4.2s, v6.2s              // q2 = lane 2
    trn2      v13.2s, v5.2s, v7.2s              // q3 = lane 3
    //end transpose

    sshr      v18.4h, v11.4h, #1                // q1 >> 1
    sshr      v19.4h, v13.4h, #1                // q3 >> 1

    add       v14.4h, v10.4h, v12.4h            // x0 = q0 + q2
    sub       v15.4h, v10.4h, v12.4h            // x1 = q0 - q2
    sub       v16.4h, v18.4h, v13.4h            // x2 = (q1 >> 1) - q3
    add       v17.4h, v11.4h, v19.4h            // x3 = q1 + (q3 >> 1)

    ld1       {v31.s}[1], [x1]                  // iv row load pu1_pred buffer (x1 is dead afterwards)

    add       v20.4h, v14.4h, v17.4h            // output row 0 = x0 + x3
    add       v21.4h, v15.4h, v16.4h            // output row 1 = x1 + x2
    sub       v22.4h, v15.4h, v16.4h            // output row 2 = x1 - x2
    sub       v23.4h, v14.4h, v17.4h            // output row 3 = x0 - x3

    mov       v20.d[1], v21.d[0]                // v20 = rows 0,1
    mov       v22.d[1], v23.d[0]                // v22 = rows 2,3

    srshr     v20.8h, v20.8h, #6                // residue = (x + 32) >> 6
    srshr     v22.8h, v22.8h, #6

    uaddw     v20.8h, v20.8h, v30.8b            // residue + prediction rows 0,1
    uaddw     v22.8h, v22.8h, v31.8b            // residue + prediction rows 2,3

    sqxtun    v0.8b, v20.8h                     // saturate to 0..255
    sqxtun    v1.8b, v22.8h

    st1       {v0.s}[0], [x2], x4               //i row store the value
    st1       {v0.s}[1], [x2], x4               //ii row store the value
    st1       {v1.s}[0], [x2], x4               //iii row store the value
    st1       {v1.s}[1], [x2]                   //iv row store the value

    pop_v_regs
    ret


///**
// *******************************************************************************
// *
// * @brief
// *  This function performs inverse quant and Inverse
// *  transform type Ci4 for 4*4 block
// *
// * @par Description:
// *  Performs inverse transform Ci4 and adds the residue to get the
// *  reconstructed block. Prediction and output rows are accessed as 8 bytes
// *  each; only the even-offset bytes are used from the prediction and only
// *  the even-offset bytes of the output are modified (the odd-offset bytes
// *  are read back and stored unchanged).
// *
// * @param[in] pi2_src
// *  Input 4x4 coefficients
// *
// * @param[in] pu1_pred
// *  Prediction 4x4 block
// *
// * @param[out] pu1_out
// *  Output 4x4 block
// *
// * @param[in] u4_qp_div_6
// *  QP / 6; used as the per-lane left-shift count during dequantisation
// *
// * @param[in] pu2_weigh_mat
// *  Pointer to weight matrix
// *
// * @param[in] pred_strd,
// *  Prediction stride
// *
// * @param[in] out_strd
// *  Output Stride
// *
// * @param[in] pi4_tmp
// *  temporary buffer of size 1*16 (never accessed by this implementation)
// *
// * @param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix
// *
// * @param[in] pi2_dc_src
// *  Address of the DC coefficient; it always replaces coefficient 0 after
// *  dequantisation
// *
// * @returns  Void
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
//                                          UWORD8 *pu1_pred,
//                                          UWORD8 *pu1_out,
//                                          WORD32 pred_strd,
//                                          WORD32 out_strd,
//                                          const UWORD16 *pu2_iscal_mat,
//                                          const UWORD16 *pu2_weigh_mat,
//                                          UWORD32 u4_qp_div_6,
//                                          WORD32 *pi4_tmp,
//                                          WORD16 *pi2_dc_src)
//**************Variables Vs Registers*****************************************
//x0            => *pi2_src
//x1            => *pu1_pred
//x2            => *pu1_out
//x3            => pred_strd
//x4            => out_strd
//x5            => *pu2_iscal_mat
//x6            => *pu2_weigh_mat
//x7            => u4_qp_div_6
//[sp, #0]      => pi4_tmp       (entry offset; unused)
//[sp, #8]      => *pi2_dc_src   (entry offset; +64 after push_v_regs => #72)
//
//Clobbers: x0, x1, x2, x10, v0-v7, v16-v23, v26-v31.
//v8-v15 are also written; push_v_regs/pop_v_regs (from ih264_neon_macros.s)
//are presumed to save/restore them - confirm against the macro definition.

    .global ih264_iquant_itrans_recon_chroma_4x4_av8
ih264_iquant_itrans_recon_chroma_4x4_av8:

//ld4 (.4h) is used because the pointer is incremented by SUB_BLK_WIDTH_4x4
//If the macro value changes need to change the instruction according to it.
//Only one shift is done in horizontal inverse because,
//if u4_qp_div_6 is lesser than 4 then shift value will be negative and do negative left shift, in this case rnd_factor has value
//if u4_qp_div_6 is greater than 4 then shift value will be positive and do left shift, here rnd_factor is 0

//at the end of the function, we could have moved 64 bits into the higher 64 bits of a register and done further processing
//but it seems to reduce the number of instructions by only 1. [Since a15 we saw add and sub to be very high throughput
//all instructions were taken as equal

    //reduce sp by 64
    push_v_regs

    dup       v30.4s, w7                        // v30 = u4_qp_div_6 in every 32-bit lane

    //was at sp + 8, hence now at sp+64+8 = sp+72
    ldr       x10, [sp, #72]                    // pi2_dc_src (DC address)

//=======================DEQUANT FROM HERE===================================

    // ld4 de-interleaves with a stride of four halfwords, so register k of each
    // group holds elements k, k+4, k+8, k+12 of the 16-entry array.
    ld4       {v20.4h - v23.4h}, [x5]           // pu2_iscal_mat[0..15]
    ld4       {v26.4h - v29.4h}, [x6]           // pu2_weigh_mat[0..15]
    ld4       {v16.4h - v19.4h}, [x0]           // pi2_src[0..15]

    mul       v20.4h, v20.4h, v26.4h            // x[i] = iscal[i] * weigh[i]   (16-bit product)
    mul       v21.4h, v21.4h, v27.4h
    mul       v22.4h, v22.4h, v28.4h
    mul       v23.4h, v23.4h, v29.4h

    smull     v0.4s, v16.4h, v20.4h             // p[i] = src[i] * x[i]         (widened to 32 bits)
    smull     v2.4s, v17.4h, v21.4h
    smull     v4.4s, v18.4h, v22.4h
    smull     v6.4s, v19.4h, v23.4h

    sshl      v0.4s, v0.4s, v30.4s              // q[i] = p[i] << (qp / 6)
    sshl      v2.4s, v2.4s, v30.4s
    sshl      v4.4s, v4.4s, v30.4s
    sshl      v6.4s, v6.4s, v30.4s

    sqrshrn   v0.4h, v0.4s, #0x4                // d0 = sat16((q[i] + 8) >> 4)
    sqrshrn   v1.4h, v2.4s, #0x4                // d1
    sqrshrn   v2.4h, v4.4s, #0x4                // d2
    sqrshrn   v3.4h, v6.4s, #0x4                // d3

    ld1       {v0.h}[0], [x10]                  // coefficient 0 = pi2_dc_src[0] (unconditional)

    //========= PROCESS IDCT FROM HERE =======
    //Steps for Stage 1:
    //------------------

    sshr      v8.4h, v1.4h, #1                  // d1 >> 1
    sshr      v9.4h, v3.4h, #1                  // d3 >> 1

    add       v4.4h, v0.4h, v2.4h               // x0 = d0 + d2
    sub       v5.4h, v0.4h, v2.4h               // x1 = d0 - d2
    sub       v6.4h, v8.4h, v3.4h               // x2 = (d1 >> 1) - d3
    add       v7.4h, v1.4h, v9.4h               // x3 = d1 + (d3 >> 1)

    add       v10.4h, v4.4h, v7.4h              // x0 + x3
    add       v11.4h, v5.4h, v6.4h              // x1 + x2
    sub       v12.4h, v5.4h, v6.4h              // x1 - x2
    sub       v13.4h, v4.4h, v7.4h              // x0 - x3

    ld1       {v26.8b}, [x1], x3                // i row load pu1_pred buffer (8 interleaved bytes)
    ld1       {v27.8b}, [x1], x3                // ii row load pu1_pred buffer
    ld1       {v28.8b}, [x1], x3                // iii row load pu1_pred buffer
    ld1       {v29.8b}, [x1]                    // iv row load pu1_pred buffer (x1 is dead afterwards)

    //Steps for Stage 2:
    //transpose the 4x4 of halfwords held in v10..v13
    trn1      v4.4h, v10.4h, v11.4h
    trn2      v5.4h, v10.4h, v11.4h
    trn1      v6.4h, v12.4h, v13.4h
    trn2      v7.4h, v12.4h, v13.4h

    trn1      v10.2s, v4.2s, v6.2s              // q0 = lane 0 of the stage-1 outputs
    trn1      v11.2s, v5.2s, v7.2s              // q1 = lane 1
    trn2      v12.2s, v4.2s, v6.2s              // q2 = lane 2
    trn2      v13.2s, v5.2s, v7.2s              // q3 = lane 3
    //end transpose

    sshr      v18.4h, v11.4h, #1                // q1 >> 1
    sshr      v19.4h, v13.4h, #1                // q3 >> 1

    add       v14.4h, v10.4h, v12.4h            // x0 = q0 + q2
    sub       v15.4h, v10.4h, v12.4h            // x1 = q0 - q2
    sub       v16.4h, v18.4h, v13.4h            // x2 = (q1 >> 1) - q3
    add       v17.4h, v11.4h, v19.4h            // x3 = q1 + (q3 >> 1)

    //Backup the output addr (x0 is free: pi2_src has been fully loaded)
    mov       x0, x2

    //load output buffer for interleaving; v10..v13 were consumed just above
    ld1       {v10.8b}, [x2], x4
    ld1       {v11.8b}, [x2], x4
    ld1       {v12.8b}, [x2], x4
    ld1       {v13.8b}, [x2]

    add       v20.4h, v14.4h, v17.4h            // output row 0 = x0 + x3
    add       v21.4h, v15.4h, v16.4h            // output row 1 = x1 + x2
    sub       v22.4h, v15.4h, v16.4h            // output row 2 = x1 - x2
    sub       v23.4h, v14.4h, v17.4h            // output row 3 = x0 - x3

    srshr     v20.4h, v20.4h, #6                // residue = (x + 32) >> 6
    srshr     v21.4h, v21.4h, #6
    srshr     v22.4h, v22.4h, #6
    srshr     v23.4h, v23.4h, #6

    movi      v31.4h, #0x00ff                   //mask for interleaving [copy lower 8 bits of each halfword]

    //Extract one plane from the interleaved prediction: even bytes of each
    //row land in bytes 0..3. v30 is only a filler second operand; bytes 4..7
    //of the result are don't-care and never reach the stored output.
    uzp1      v26.8b, v26.8b, v30.8b
    uzp1      v27.8b, v27.8b, v30.8b
    uzp1      v28.8b, v28.8b, v30.8b
    uzp1      v29.8b, v29.8b, v30.8b

    uaddw     v20.8h, v20.8h, v26.8b            // residue + prediction (lanes 0..3 meaningful)
    uaddw     v21.8h, v21.8h, v27.8b
    uaddw     v22.8h, v22.8h, v28.8b
    uaddw     v23.8h, v23.8h, v29.8b

    sqxtun    v0.8b, v20.8h                     // saturate to 0..255
    sqxtun    v1.8b, v21.8h
    sqxtun    v2.8b, v22.8h
    sqxtun    v3.8b, v23.8h

    //widen the output so that we have 0 at msb and value at lsb of each halfword
    uxtl      v6.8h, v0.8b
    uxtl      v7.8h, v1.8b
    uxtl      v8.8h, v2.8b
    uxtl      v9.8h, v3.8b

    //select lsbs from processed data and msbs from pu1_out loaded data
    bit       v10.8b, v6.8b, v31.8b
    bit       v11.8b, v7.8b, v31.8b
    bit       v12.8b, v8.8b, v31.8b
    bit       v13.8b, v9.8b, v31.8b

    //store the interleaved result
    st1       {v10.8b}, [x0], x4
    st1       {v11.8b}, [x0], x4
    st1       {v12.8b}, [x0], x4
    st1       {v13.8b}, [x0]

    pop_v_regs
    ret

///*
// *******************************************************************************
// *
// * //brief
// *  This function performs inverse quant and Inverse transform type Ci8 for 8*8 block
// *
// * //par Description:
// *  Performs inverse transform Ci8 and adds the residue to get the
// *  reconstructed block
// *
// * //param[in] pi2_src
// *  Input 8x8 coefficients
// *
// * //param[in] pu1_pred
// *  Prediction 8x8 block
// *
// * //param[out] pu1_out
// *  Output 8x8 block
// *
// * //param[in] u4_qp_div_6
// *  QP / 6; used as the per-lane left-shift count during dequantisation
// *
// *
// * //param[in] pu2_weigh_mat
// *  Pointer to weight matrix
// *
// * //param[in] pred_strd,
// *  Prediction stride
// *
// * //param[in] out_strd
// *  Output Stride
// *
// * //param[in] pi4_tmp
// *  temporary buffer of size 1*64 (never accessed by this implementation)
// *
// * //param[in] pu2_iscal_mat
// *  Pointer to the inverse quantization matrix
// *
// * //returns  Void
// *
// * //remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
//                                   UWORD8 *pu1_pred,
//                                   UWORD8 *pu1_out,
//                                   WORD32 pred_strd,
//                                   WORD32 out_strd,
//                                   const UWORD16 *pu2_iscal_mat,
//                                   const UWORD16 *pu2_weigh_mat,
//                                   UWORD32 u4_qp_div_6,
//                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx,
//                                   WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0        => *pi2_src
//x1        => *pu1_pred
//x2        => *pu1_out
//x3        => pred_strd
//x4        => out_strd
//x5        => *pu2_iscal_mat
//x6        => *pu2_weigh_mat
//x7        => u4_qp_div_6
//NOT USED  => pi4_tmp
//NOT USED  => iq_start_idx
//NOT USED  => pi2_dc_ld_addr
//
//Clobbers: x0, x1, x2, x5, x6, x8, NZCV, v0-v7, v16-v31.
//v8-v15 are also written; push_v_regs/pop_v_regs (from ih264_neon_macros.s)
//are presumed to save/restore them - confirm against the macro definition.

    .global ih264_iquant_itrans_recon_8x8_av8
ih264_iquant_itrans_recon_8x8_av8:

    push_v_regs

    // Each matrix is 64 halfwords = 128 contiguous bytes = 8 registers,
    // one 8-element row per register.
    ld1       {v8.8h - v11.8h}, [x5], #64       // pu2_iscal_mat rows 0..3
    ld1       {v12.8h - v15.8h}, [x5]           // pu2_iscal_mat rows 4..7

    ld1       {v16.8h - v19.8h}, [x6], #64      // pu2_weigh_mat rows 0..3
    ld1       {v20.8h - v23.8h}, [x6]           // pu2_weigh_mat rows 4..7

    ld1       {v0.8h - v3.8h}, [x0], #64        // pi2_src rows 0..3
    ld1       {v4.8h - v7.8h}, [x0]             // pi2_src rows 4..7

    // x[i] = iscal[i] * weigh[i]   (16-bit product)
    mul       v8.8h, v8.8h, v16.8h
    mul       v9.8h, v9.8h, v17.8h
    mul       v10.8h, v10.8h, v18.8h
    mul       v11.8h, v11.8h, v19.8h
    mul       v12.8h, v12.8h, v20.8h
    mul       v13.8h, v13.8h, v21.8h
    mul       v14.8h, v14.8h, v22.8h
    mul       v15.8h, v15.8h, v23.8h

    // p[i] = src[i] * x[i], widened to 32 bits (low half / high half of each row)
    smull     v16.4s, v0.4h, v8.4h
    smull2    v17.4s, v0.8h, v8.8h
    smull     v18.4s, v1.4h, v9.4h
    smull2    v19.4s, v1.8h, v9.8h
    smull     v20.4s, v2.4h, v10.4h
    smull2    v21.4s, v2.8h, v10.8h
    smull     v22.4s, v3.4h, v11.4h
    smull2    v23.4s, v3.8h, v11.8h
    smull     v24.4s, v4.4h, v12.4h
    smull2    v25.4s, v4.8h, v12.8h
    smull     v26.4s, v5.4h, v13.4h
    smull2    v27.4s, v5.8h, v13.8h
    smull     v28.4s, v6.4h, v14.4h
    smull2    v29.4s, v6.8h, v14.8h
    smull     v30.4s, v7.4h, v15.4h
    smull2    v31.4s, v7.8h, v15.8h

    dup       v0.4s, w7                         // v0 = u4_qp_div_6 in every 32-bit lane

    // q[i] = p[i] << (qp / 6)
    sshl      v16.4s, v16.4s, v0.4s
    sshl      v17.4s, v17.4s, v0.4s
    sshl      v18.4s, v18.4s, v0.4s
    sshl      v19.4s, v19.4s, v0.4s
    sshl      v20.4s, v20.4s, v0.4s
    sshl      v21.4s, v21.4s, v0.4s
    sshl      v22.4s, v22.4s, v0.4s
    sshl      v23.4s, v23.4s, v0.4s
    sshl      v24.4s, v24.4s, v0.4s
    sshl      v25.4s, v25.4s, v0.4s
    sshl      v26.4s, v26.4s, v0.4s
    sshl      v27.4s, v27.4s, v0.4s
    sshl      v28.4s, v28.4s, v0.4s
    sshl      v29.4s, v29.4s, v0.4s
    sshl      v30.4s, v30.4s, v0.4s
    sshl      v31.4s, v31.4s, v0.4s

    // c[i] = sat16((q[i] + 32) >> 6); v0..v7 = dequantised rows 0..7
    sqrshrn   v0.4h, v16.4s, #6
    sqrshrn2  v0.8h, v17.4s, #6
    sqrshrn   v1.4h, v18.4s, #6
    sqrshrn2  v1.8h, v19.4s, #6
    sqrshrn   v2.4h, v20.4s, #6
    sqrshrn2  v2.8h, v21.4s, #6
    sqrshrn   v3.4h, v22.4s, #6
    sqrshrn2  v3.8h, v23.4s, #6
    sqrshrn   v4.4h, v24.4s, #6
    sqrshrn2  v4.8h, v25.4s, #6
    sqrshrn   v5.4h, v26.4s, #6
    sqrshrn2  v5.8h, v27.4s, #6
    sqrshrn   v6.4h, v28.4s, #6
    sqrshrn2  v6.8h, v29.4s, #6
    sqrshrn   v7.4h, v30.4s, #6
    sqrshrn2  v7.8h, v31.4s, #6

    //loop counter: the transpose + 1-D transform below runs twice
    mov       x8, #2
//1x8 transform
trans_1x8_1d:

    //transpose 8x8: v0..v7 in, v8..v15 out
    trn1      v8.8h, v0.8h, v1.8h
    trn2      v9.8h, v0.8h, v1.8h
    trn1      v10.8h, v2.8h, v3.8h
    trn2      v11.8h, v2.8h, v3.8h
    trn1      v12.8h, v4.8h, v5.8h
    trn2      v13.8h, v4.8h, v5.8h
    trn1      v14.8h, v6.8h, v7.8h
    trn2      v15.8h, v6.8h, v7.8h

    trn1      v0.4s, v8.4s, v10.4s
    trn2      v2.4s, v8.4s, v10.4s
    trn1      v1.4s, v9.4s, v11.4s
    trn2      v3.4s, v9.4s, v11.4s
    trn1      v4.4s, v12.4s, v14.4s
    trn2      v6.4s, v12.4s, v14.4s
    trn1      v5.4s, v13.4s, v15.4s
    trn2      v7.4s, v13.4s, v15.4s

    // After this step vN (N = 8..15) holds lane (N - 8) of every input
    // register, i.e. w0..w7 = v8..v15 in register-number order.
    trn1      v8.2d, v0.2d, v4.2d               // w0
    trn2      v12.2d, v0.2d, v4.2d              // w4
    trn1      v9.2d, v1.2d, v5.2d               // w1
    trn2      v13.2d, v1.2d, v5.2d              // w5
    trn1      v10.2d, v2.2d, v6.2d              // w2
    trn2      v14.2d, v2.2d, v6.2d              // w6
    trn1      v11.2d, v3.2d, v7.2d              // w3
    trn2      v15.2d, v3.2d, v7.2d              // w7

    sshr      v16.8h, v9.8h, #1                 // w1 >> 1
    sshr      v17.8h, v10.8h, #1                // w2 >> 1
    sshr      v18.8h, v11.8h, #1                // w3 >> 1
    sshr      v19.8h, v13.8h, #1                // w5 >> 1
    sshr      v20.8h, v14.8h, #1                // w6 >> 1
    sshr      v21.8h, v15.8h, #1                // w7 >> 1

    add       v0.8h, v8.8h, v12.8h              // y0 = w0 + w4
    sub       v2.8h, v8.8h, v12.8h              // y2 = w0 - w4

    sub       v4.8h, v17.8h, v14.8h             // y4 = (w2 >> 1) - w6
    add       v6.8h, v10.8h, v20.8h             // y6 = w2 + (w6 >> 1)

    // The odd terms are accumulated in 32 bits and narrowed with xtn below.
    //-w3 + w5
    ssubl     v22.4s, v13.4h, v11.4h
    ssubl2    v23.4s, v13.8h, v11.8h
    //w3 + w5
    saddl     v24.4s, v13.4h, v11.4h
    saddl2    v25.4s, v13.8h, v11.8h
    //-w1 + w7
    ssubl     v26.4s, v15.4h, v9.4h
    ssubl2    v27.4s, v15.8h, v9.8h
    //w1 + w7
    saddl     v28.4s, v15.4h, v9.4h
    saddl2    v29.4s, v15.8h, v9.8h

    //-w3 + w5 - w7
    ssubw     v22.4s, v22.4s, v15.4h
    ssubw2    v23.4s, v23.4s, v15.8h
    //w3 + w5 + w1
    saddw     v24.4s, v24.4s, v9.4h
    saddw2    v25.4s, v25.4s, v9.8h
    //-w1 + w7 + w5
    saddw     v26.4s, v26.4s, v13.4h
    saddw2    v27.4s, v27.4s, v13.8h
    //w1 + w7 - w3
    ssubw     v28.4s, v28.4s, v11.4h
    ssubw2    v29.4s, v29.4s, v11.8h

    //-w3 + w5 - w7 - (w7 >> 1)
    ssubw     v22.4s, v22.4s, v21.4h
    ssubw2    v23.4s, v23.4s, v21.8h
    //w3 + w5 + w1 + (w1 >> 1)
    saddw     v24.4s, v24.4s, v16.4h
    saddw2    v25.4s, v25.4s, v16.8h
    //-w1 + w7 + w5 + (w5 >> 1)
    saddw     v26.4s, v26.4s, v19.4h
    saddw2    v27.4s, v27.4s, v19.8h
    //w1 + w7 - w3 - (w3 >> 1)
    ssubw     v28.4s, v28.4s, v18.4h
    ssubw2    v29.4s, v29.4s, v18.8h

    xtn       v1.4h, v22.4s                     // y1 (low 16 bits kept)
    xtn2      v1.8h, v23.4s
    xtn       v3.4h, v28.4s                     // y3
    xtn2      v3.8h, v29.4s
    xtn       v5.4h, v26.4s                     // y5
    xtn2      v5.8h, v27.4s
    xtn       v7.4h, v24.4s                     // y7
    xtn2      v7.8h, v25.4s

    sshr      v16.8h, v1.8h, #2                 //(y1 >> 2)
    sshr      v17.8h, v3.8h, #2                 //(y3 >> 2)
    sshr      v18.8h, v5.8h, #2                 //(y5 >> 2)
    sshr      v19.8h, v7.8h, #2                 //(y7 >> 2)

    add       v8.8h, v0.8h, v6.8h               // z0 = y0 + y6
    add       v9.8h, v1.8h, v19.8h              // z1 = y1 + (y7 >> 2)
    add       v10.8h, v2.8h, v4.8h              // z2 = y2 + y4
    add       v11.8h, v3.8h, v18.8h             // z3 = y3 + (y5 >> 2)
    sub       v12.8h, v2.8h, v4.8h              // z4 = y2 - y4
    sub       v13.8h, v17.8h, v5.8h             // z5 = (y3 >> 2) - y5
    sub       v14.8h, v0.8h, v6.8h              // z6 = y0 - y6
    sub       v15.8h, v7.8h, v16.8h             // z7 = y7 - (y1 >> 2)

    add       v0.8h, v8.8h, v15.8h              // out0 = z0 + z7
    add       v1.8h, v10.8h, v13.8h             // out1 = z2 + z5
    add       v2.8h, v12.8h, v11.8h             // out2 = z4 + z3
    add       v3.8h, v14.8h, v9.8h              // out3 = z6 + z1
    sub       v4.8h, v14.8h, v9.8h              // out4 = z6 - z1
    sub       v5.8h, v12.8h, v11.8h             // out5 = z4 - z3
    sub       v6.8h, v10.8h, v13.8h             // out6 = z2 - z5
    sub       v7.8h, v8.8h, v15.8h              // out7 = z0 - z7

    subs      x8, x8, #1
    bne       trans_1x8_1d

    // Prediction rows 0..7
    ld1       {v22.8b}, [x1], x3
    ld1       {v23.8b}, [x1], x3
    ld1       {v24.8b}, [x1], x3
    ld1       {v25.8b}, [x1], x3
    ld1       {v26.8b}, [x1], x3
    ld1       {v27.8b}, [x1], x3
    ld1       {v28.8b}, [x1], x3
    ld1       {v29.8b}, [x1]

    // residue = (x + 32) >> 6
    srshr     v0.8h, v0.8h, #6
    srshr     v1.8h, v1.8h, #6
    srshr     v2.8h, v2.8h, #6
    srshr     v3.8h, v3.8h, #6
    srshr     v4.8h, v4.8h, #6
    srshr     v5.8h, v5.8h, #6
    srshr     v6.8h, v6.8h, #6
    srshr     v7.8h, v7.8h, #6

    // residue + prediction
    uaddw     v0.8h, v0.8h, v22.8b
    uaddw     v1.8h, v1.8h, v23.8b
    uaddw     v2.8h, v2.8h, v24.8b
    uaddw     v3.8h, v3.8h, v25.8b
    uaddw     v4.8h, v4.8h, v26.8b
    uaddw     v5.8h, v5.8h, v27.8b
    uaddw     v6.8h, v6.8h, v28.8b
    uaddw     v7.8h, v7.8h, v29.8b

    // saturate to 0..255
    sqxtun    v0.8b, v0.8h
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h
    sqxtun    v3.8b, v3.8h
    sqxtun    v4.8b, v4.8h
    sqxtun    v5.8b, v5.8h
    sqxtun    v6.8b, v6.8h
    sqxtun    v7.8b, v7.8h

    // Output rows 0..7
    st1       {v0.8b}, [x2], x4
    st1       {v1.8b}, [x2], x4
    st1       {v2.8b}, [x2], x4
    st1       {v3.8b}, [x2], x4
    st1       {v4.8b}, [x2], x4
    st1       {v5.8b}, [x2], x4
    st1       {v6.8b}, [x2], x4
    st1       {v7.8b}, [x2]

    pop_v_regs
    ret