//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ih264_resi_trans_quant_av8.c
//*
//* @brief
//*  contains function definitions for residual computation and forward transform
//*
//* @author
//*  ittiam
//*
//* @par list of functions:
//*  ih264_resi_trans_quant_4x4_av8
//*  ih264_resi_trans_quant_8x8_av8
//*  ih264_resi_trans_quant_chroma_4x4_av8
//* @remarks
//*  none
//*
//*******************************************************************************
.include "ih264_neon_macros.s"
.text
.p2align 2
//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_4x4
//* description       : this function does residue calculation, forward
//*                     transform and quantization for a 4x4 luma block
//*
// values returned    : none
//
// register usage     :
// stack usage        : 64 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)   changes
//     1  12 2013     100633      first version
//     20 1  2014     100633      changed the api, optimization
//
//*****************************************************************************
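//-----------------------------------------------------------------------------
// illustrative c-style sketch of what this routine computes (an assumption
// paraphrased from the register list and the per-instruction comments below,
// not the library's reference c code; names r[], t[], sign, level are
// hypothetical):
//      residue  : r[i][j] = src[i][j] - pred[i][j] for the 4x4 block
//      row pass : x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7;
//                 t0 = x0 + x1;          t1 = (x3 << 1) + x2;
//                 t2 = x0 - x1;          t3 = x3 - (x2 << 1);
//      col pass : the same butterfly applied to every column of t
//      the unquantized dc term is stored to the alternate dc address (x10)
//      quant    : sign    = (w > 0) ? 1 : -1;
//                 level   = (abs(w) * scale_matrix[k] + round_factor) >> qbits;
//                 dst[k]  = sign * level;
//      nnz      : count of non-zero levels among the 16 coefficients
//-----------------------------------------------------------------------------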

    .global ih264_resi_trans_quant_4x4_av8
ih264_resi_trans_quant_4x4_av8:

    push_v_regs
    //x0  :pointer to src buffer
    //x1  :pointer to pred buffer
    //x2  :pointer to dst buffer
    //w3  :source stride
    //w4  :pred stride
    //x5  :scale matrix
    //x6  :threshold matrix
    //w7  :qbits
    //w8  :round factor
    //x9  :nnz
    //x10 :pointer to store non quantized dc value

    sxtw x3, w3
    sxtw x4, w4
    ldr w8, [sp, #64]               //load round factor
    ldr x10, [sp, #80]              //load address for non quant val
    neg w7, w7                      //negate the qbit value for using lsl
    ldr x9, [sp, #72]

    //------------function loading done----------------

    ld1 {v30.8b}, [x0], x3          //load first 8 pix src row 1
    ld1 {v31.8b}, [x1], x4          //load first 8 pix pred row 1
    ld1 {v28.8b}, [x0], x3          //load first 8 pix src row 2
    ld1 {v29.8b}, [x1], x4          //load first 8 pix pred row 2
    ld1 {v26.8b}, [x0], x3          //load first 8 pix src row 3
    ld1 {v27.8b}, [x1], x4          //load first 8 pix pred row 3
    ld1 {v24.8b}, [x0]              //load first 8 pix src row 4
    ld1 {v25.8b}, [x1]              //load first 8 pix pred row 4

    usubl v0.8h, v30.8b, v31.8b     //find residue row 1
    usubl v2.8h, v28.8b, v29.8b     //find residue row 2
    usubl v4.8h, v26.8b, v27.8b     //find residue row 3
    usubl v6.8h, v24.8b, v25.8b     //find residue row 4

    trn1 v1.4h, v0.4h, v2.4h
    trn2 v3.4h, v0.4h, v2.4h        //t12
    trn1 v5.4h, v4.4h, v6.4h
    trn2 v7.4h, v4.4h, v6.4h        //t23

    trn1 v0.2s, v1.2s, v5.2s
    trn2 v4.2s, v1.2s, v5.2s        //t13
    trn1 v2.2s, v3.2s, v7.2s
    trn2 v6.2s, v3.2s, v7.2s        //t14

    add v8.4h, v0.4h, v6.4h         //x0 = x4+x7
    add v9.4h, v2.4h, v4.4h         //x1 = x5+x6
    sub v10.4h, v2.4h, v4.4h        //x2 = x5-x6
    sub v11.4h, v0.4h, v6.4h        //x3 = x4-x7

    shl v12.4h, v10.4h, #1          //u_shift(x2,1,shft)
    shl v13.4h, v11.4h, #1          //u_shift(x3,1,shft)

    add v14.4h, v8.4h, v9.4h        //x4 = x0 + x1;
    sub v16.4h, v8.4h, v9.4h        //x6 = x0 - x1;
    add v15.4h, v13.4h, v10.4h      //x5 = u_shift(x3,1,shft) + x2;
    sub v17.4h, v11.4h, v12.4h      //x7 = x3 - u_shift(x2,1,shft);

    //take the transpose again so as to do the vertical transform
    trn1 v0.4h, v14.4h, v15.4h
    trn2 v1.4h, v14.4h, v15.4h      //t12
    trn1 v2.4h, v16.4h, v17.4h
    trn2 v3.4h, v16.4h, v17.4h      //t23

    trn1 v14.2s, v0.2s, v2.2s
    trn2 v16.2s, v0.2s, v2.2s       //t13
    trn1 v15.2s, v1.2s, v3.2s
    trn2 v17.2s, v1.2s, v3.2s       //t24

    //vertical transform, same code as horizontal
    add v18.4h, v14.4h, v17.4h      //x0 = x4+x7
    add v19.4h, v15.4h, v16.4h      //x1 = x5+x6
    sub v20.4h, v15.4h, v16.4h      //x2 = x5-x6
    sub v21.4h, v14.4h, v17.4h      //x3 = x4-x7

    shl v22.4h, v20.4h, #1          //u_shift(x2,1,shft)
    shl v23.4h, v21.4h, #1          //u_shift(x3,1,shft)

    dup v8.4s, w8                   //load rounding value row 1

    add v24.4h, v18.4h, v19.4h      //x5 = x0 + x1;
    sub v26.4h, v18.4h, v19.4h      //x7 = x0 - x1;
    add v25.4h, v23.4h, v20.4h      //x6 = u_shift(x3,1,shft) + x2;
    sub v27.4h, v21.4h, v22.4h      //x8 = x3 - u_shift(x2,1,shft);

    dup v23.4s, w8                  //load round factor values

    st1 {v24.h}[0], [x10]           //store the dc value to the alternate dc address
//core transform is done for the 4x4 block
    ld1 {v28.4h-v31.4h}, [x5]       //load the scaling values

    abs v0.4h, v24.4h               //abs val of row 1
    abs v1.4h, v25.4h               //abs val of row 2
    abs v2.4h, v26.4h               //abs val of row 3
    abs v3.4h, v27.4h               //abs val of row 4

    cmgt v4.4h, v24.4h, #0
    cmgt v5.4h, v25.4h, #0
    cmgt v6.4h, v26.4h, #0
    cmgt v7.4h, v27.4h, #0

    smull v0.4s, v0.4h, v28.4h      //multiply with scale matrix row 1
    smull v1.4s, v1.4h, v29.4h      //multiply with scale matrix row 2
    smull v2.4s, v2.4h, v30.4h      //multiply with scale matrix row 3
    smull v3.4s, v3.4h, v31.4h      //multiply with scale matrix row 4

    add v20.4s, v0.4s, v23.4s
    add v21.4s, v1.4s, v23.4s
    add v22.4s, v2.4s, v23.4s
    add v23.4s, v3.4s, v23.4s

    dup v24.4s, w7

    sshl v20.4s, v20.4s, v24.4s     //shift row 1
    sshl v21.4s, v21.4s, v24.4s     //shift row 2
    sshl v22.4s, v22.4s, v24.4s     //shift row 3
    sshl v23.4s, v23.4s, v24.4s     //shift row 4

    xtn v20.4h, v20.4s              //narrow row 1
    xtn v21.4h, v21.4s              //narrow row 2
    xtn v22.4h, v22.4s              //narrow row 3
    xtn v23.4h, v23.4s              //narrow row 4

    neg v24.8h, v20.8h              //get negative
    neg v25.8h, v21.8h              //get negative
    neg v26.8h, v22.8h              //get negative
    neg v27.8h, v23.8h              //get negative

    //compare with zero for computing nnz
    cmeq v0.4h, v20.4h, #0
    cmeq v1.4h, v21.4h, #0
    cmeq v2.4h, v22.4h, #0
    cmeq v3.4h, v23.4h, #0

    bsl v4.8b, v20.8b, v24.8b       //restore sign of row 1
    bsl v5.8b, v21.8b, v25.8b       //restore sign of row 2
    bsl v6.8b, v22.8b, v26.8b       //restore sign of row 3
    bsl v7.8b, v23.8b, v27.8b       //restore sign of row 4

    //narrow the comparison result
    mov v0.d[1], v2.d[0]
    mov v1.d[1], v3.d[0]

    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h

    ushr v0.8b, v0.8b, #7           //reduce comparison result to a single bit per coefficient [keep the value for later use]
    ushr v1.8b, v1.8b, #7           //reduce comparison result to a single bit per coefficient [keep the value for later use]

    add v0.8b, v0.8b, v1.8b         //pair add nnz
    addp v0.8b, v0.8b, v0.8b        //pair add nnz
    addp v0.8b, v0.8b, v0.8b        //pair add nnz
    addp v0.8b, v0.8b, v0.8b        //pair add nnz

    st1 {v4.4h-v7.4h}, [x2]         //store blk

    movi v25.8b, #16                //get max nnz
    sub v26.8b, v25.8b, v0.8b       //nnz = 16 - zero count
    st1 {v26.b}[0], [x9]            //write nnz

    pop_v_regs
    ret

//*****************************************************************************
//*
//* function name     : ih264_resi_trans_quant_chroma_4x4
//* description       : this function does residue calculation, forward transform
//*                     and quantization for a 4x4 chroma block.
//*
// values returned    : none
//
// register usage     :
// stack usage        : 64 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)   changes
//     11 2  2015     100664      first version
//     25 2  2015     100633      first av8 version
//*****************************************************************************
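//-----------------------------------------------------------------------------
// note (a sketch, assumed from the uzp1 instructions below rather than stated
// by the reference c): the chroma src and pred rows are u/v interleaved, so
// each loaded 8-byte row is first de-interleaved to pick one plane, roughly
//      plane_row[j] = interleaved_row[2 * j],  j = 0..3
// after that, the residue, transform and quantization steps are identical to
// the luma routine above.
//-----------------------------------------------------------------------------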

    .global ih264_resi_trans_quant_chroma_4x4_av8
ih264_resi_trans_quant_chroma_4x4_av8:

    push_v_regs
    //x0  :pointer to src buffer
    //x1  :pointer to pred buffer
    //x2  :pointer to dst buffer
    //w3  :source stride
    //w4  :pred stride
    //x5  :scale matrix
    //x6  :threshold matrix
    //w7  :qbits
    //w8  :round factor
    //x9  :nnz
    //x10 :pointer to store non quantized dc value

    sxtw x3, w3
    sxtw x4, w4
    ldr w8, [sp, #64]               //load round factor
    ldr x10, [sp, #80]              //load address for non quant val
    neg w7, w7                      //negate the qbit value for using lsl
    ldr x9, [sp, #72]
    //------------function loading done----------------

    ld1 {v30.8b}, [x0], x3          //load first 8 pix src row 1
    ld1 {v31.8b}, [x1], x4          //load first 8 pix pred row 1
    ld1 {v28.8b}, [x0], x3          //load first 8 pix src row 2
    ld1 {v29.8b}, [x1], x4          //load first 8 pix pred row 2
    ld1 {v26.8b}, [x0], x3          //load first 8 pix src row 3
    ld1 {v27.8b}, [x1], x4          //load first 8 pix pred row 3
    ld1 {v24.8b}, [x0]              //load first 8 pix src row 4
    ld1 {v25.8b}, [x1]              //load first 8 pix pred row 4

    //deinterleave the loaded values
    uzp1 v30.8b, v30.8b, v30.8b
    uzp1 v31.8b, v31.8b, v31.8b
    uzp1 v28.8b, v28.8b, v28.8b
    uzp1 v29.8b, v29.8b, v29.8b
    uzp1 v26.8b, v26.8b, v26.8b
    uzp1 v27.8b, v27.8b, v27.8b
    uzp1 v24.8b, v24.8b, v24.8b
    uzp1 v25.8b, v25.8b, v25.8b
    //this deinterleaving is the only difference between the chroma and luma functions

    usubl v0.8h, v30.8b, v31.8b     //find residue row 1
    usubl v2.8h, v28.8b, v29.8b     //find residue row 2
    usubl v4.8h, v26.8b, v27.8b     //find residue row 3
    usubl v6.8h, v24.8b, v25.8b     //find residue row 4

    trn1 v1.4h, v0.4h, v2.4h
    trn2 v3.4h, v0.4h, v2.4h        //t12
    trn1 v5.4h, v4.4h, v6.4h
    trn2 v7.4h, v4.4h, v6.4h        //t23

    trn1 v0.2s, v1.2s, v5.2s
    trn2 v4.2s, v1.2s, v5.2s        //t13
    trn1 v2.2s, v3.2s, v7.2s
    trn2 v6.2s, v3.2s, v7.2s        //t14

    add v8.4h, v0.4h, v6.4h         //x0 = x4+x7
    add v9.4h, v2.4h, v4.4h         //x1 = x5+x6
    sub v10.4h, v2.4h, v4.4h        //x2 = x5-x6
    sub v11.4h, v0.4h, v6.4h        //x3 = x4-x7

    shl v12.4h, v10.4h, #1          //u_shift(x2,1,shft)
    shl v13.4h, v11.4h, #1          //u_shift(x3,1,shft)

    add v14.4h, v8.4h, v9.4h        //x4 = x0 + x1;
    sub v16.4h, v8.4h, v9.4h        //x6 = x0 - x1;
    add v15.4h, v13.4h, v10.4h      //x5 = u_shift(x3,1,shft) + x2;
    sub v17.4h, v11.4h, v12.4h      //x7 = x3 - u_shift(x2,1,shft);

    //take the transpose again so as to do the vertical transform
    trn1 v0.4h, v14.4h, v15.4h
    trn2 v1.4h, v14.4h, v15.4h      //t12
    trn1 v2.4h, v16.4h, v17.4h
    trn2 v3.4h, v16.4h, v17.4h      //t23

    trn1 v14.2s, v0.2s, v2.2s
    trn2 v16.2s, v0.2s, v2.2s       //t13
    trn1 v15.2s, v1.2s, v3.2s
    trn2 v17.2s, v1.2s, v3.2s       //t24

    //vertical transform, same code as horizontal
    add v18.4h, v14.4h, v17.4h      //x0 = x4+x7
    add v19.4h, v15.4h, v16.4h      //x1 = x5+x6
    sub v20.4h, v15.4h, v16.4h      //x2 = x5-x6
    sub v21.4h, v14.4h, v17.4h      //x3 = x4-x7

    shl v22.4h, v20.4h, #1          //u_shift(x2,1,shft)
    shl v23.4h, v21.4h, #1          //u_shift(x3,1,shft)

    dup v8.4s, w8                   //load rounding value row 1

    add v24.4h, v18.4h, v19.4h      //x5 = x0 + x1;
    sub v26.4h, v18.4h, v19.4h      //x7 = x0 - x1;
    add v25.4h, v23.4h, v20.4h      //x6 = u_shift(x3,1,shft) + x2;
    sub v27.4h, v21.4h, v22.4h      //x8 = x3 - u_shift(x2,1,shft);

    dup v23.4s, w8                  //load round factor values

    st1 {v24.h}[0], [x10]           //store the dc value to the alternate dc address
//core transform is done for the 4x4 block
    ld1 {v28.4h-v31.4h}, [x5]       //load the scaling values

    abs v0.4h, v24.4h               //abs val of row 1
    abs v1.4h, v25.4h               //abs val of row 2
    abs v2.4h, v26.4h               //abs val of row 3
    abs v3.4h, v27.4h               //abs val of row 4

    cmgt v4.4h, v24.4h, #0
    cmgt v5.4h, v25.4h, #0
    cmgt v6.4h, v26.4h, #0
    cmgt v7.4h, v27.4h, #0

    smull v0.4s, v0.4h, v28.4h      //multiply with scale matrix row 1
    smull v1.4s, v1.4h, v29.4h      //multiply with scale matrix row 2
    smull v2.4s, v2.4h, v30.4h      //multiply with scale matrix row 3
    smull v3.4s, v3.4h, v31.4h      //multiply with scale matrix row 4

    add v20.4s, v0.4s, v23.4s
    add v21.4s, v1.4s, v23.4s
    add v22.4s, v2.4s, v23.4s
    add v23.4s, v3.4s, v23.4s

    dup v24.4s, w7

    sshl v20.4s, v20.4s, v24.4s     //shift row 1
    sshl v21.4s, v21.4s, v24.4s     //shift row 2
    sshl v22.4s, v22.4s, v24.4s     //shift row 3
    sshl v23.4s, v23.4s, v24.4s     //shift row 4

    xtn v20.4h, v20.4s              //narrow row 1
    xtn v21.4h, v21.4s              //narrow row 2
    xtn v22.4h, v22.4s              //narrow row 3
    xtn v23.4h, v23.4s              //narrow row 4

    neg v24.8h, v20.8h              //get negative
    neg v25.8h, v21.8h              //get negative
    neg v26.8h, v22.8h              //get negative
    neg v27.8h, v23.8h              //get negative

    //compare with zero for computing nnz
    cmeq v0.4h, v20.4h, #0
    cmeq v1.4h, v21.4h, #0
    cmeq v2.4h, v22.4h, #0
    cmeq v3.4h, v23.4h, #0

    bsl v4.8b, v20.8b, v24.8b       //restore sign of row 1
    bsl v5.8b, v21.8b, v25.8b       //restore sign of row 2
    bsl v6.8b, v22.8b, v26.8b       //restore sign of row 3
    bsl v7.8b, v23.8b, v27.8b       //restore sign of row 4

    //narrow the comparison result
    mov v0.d[1], v2.d[0]
    mov v1.d[1], v3.d[0]

    xtn v0.8b, v0.8h
    xtn v1.8b, v1.8h

    ushr v0.8b, v0.8b, #7           //reduce comparison result to a single bit per coefficient [keep the value for later use]
    ushr v1.8b, v1.8b, #7           //reduce comparison result to a single bit per coefficient [keep the value for later use]

    add v0.8b, v0.8b, v1.8b         //pair add nnz
    addp v0.8b, v0.8b, v0.8b        //pair add nnz
    addp v0.8b, v0.8b, v0.8b        //pair add nnz
    addp v0.8b, v0.8b, v0.8b        //pair add nnz

    st1 {v4.4h-v7.4h}, [x2]         //store blk

    movi v25.8b, #16                //get max nnz
    sub v26.8b, v25.8b, v0.8b       //nnz = 16 - zero count
    st1 {v26.b}[0], [x9]            //write nnz

    pop_v_regs
    ret

//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_4x4_av8
//* description       : this function does forward hadamard transform and
//*                     quantization for luma dc block
//*
//* arguments         : x0 :pointer to src buffer
//                      x1 :pointer to dst buffer
//                      x2 :pu2_scale_matrix
//                      x3 :pu2_threshold_matrix
//                      w4 :u4_qbits
//                      w5 :u4_round_factor
//                      x6 :pu1_nnz
// values returned    : none
//
// register usage     :
// stack usage        : 0 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)   changes
//     20 2  2015     100633      first version
//
//*****************************************************************************
//ih264_hadamard_quant_4x4_av8(word16 *pi2_src, word16 *pi2_dst,
//                             const uword16 *pu2_scale_matrix,
//                             const uword16 *pu2_threshold_matrix,
//                             uword32 u4_qbits, uword32 u4_round_factor,
//                             uword8 *pu1_nnz)
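//-----------------------------------------------------------------------------
// illustrative c-style sketch (an assumption paraphrased from the comments in
// the code below, not the library's reference c). x4..x7 name the four dc
// values of one row/column of the 4x4 luma dc block:
//      first pass  : x0 = x4 + x7; x1 = x5 + x6; x2 = x5 - x6; x3 = x4 - x7;
//                    d0 = x0 + x1;  d1 = x3 + x2;  d2 = x0 - x1;  d3 = x3 - x2;
//      second pass : same butterfly along the other direction, then every
//                    result is halved: value = d >> 1;
//      quant       : level = (abs(value) * pu2_scale_matrix[0]
//                             + u4_round_factor) >> u4_qbits;
//                    pi2_dst[k] = (value > 0) ? level : -level;
//      nnz         : *pu1_nnz = 16 - number of zero levels
//-----------------------------------------------------------------------------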
    .global ih264_hadamard_quant_4x4_av8
ih264_hadamard_quant_4x4_av8:

//x0 :pointer to src buffer
//x1 :pointer to dst buffer
//x2 :pu2_scale_matrix
//x3 :pu2_threshold_matrix
//w4 :u4_qbits
//w5 :u4_round_factor
//x6 :pu1_nnz

    push_v_regs

    ld4 {v0.4h-v3.4h}, [x0]         //load 4x4 block
    ld1 {v30.h}[0], [x2]            //load pu2_scale_matrix[0]

    saddl v4.4s, v0.4h, v3.4h       //x0 = x4 + x7;
    saddl v5.4s, v1.4h, v2.4h       //x1 = x5 + x6;
    ssubl v6.4s, v1.4h, v2.4h       //x2 = x5 - x6;
    ssubl v7.4s, v0.4h, v3.4h       //x3 = x4 - x7;

    dup v30.8h, v30.h[0]            //pu2_scale_matrix[0]

    add v14.4s, v4.4s, v5.4s        //pi2_dst[0] = x0 + x1;
    add v15.4s, v7.4s, v6.4s        //pi2_dst[1] = x3 + x2;
    sub v16.4s, v4.4s, v5.4s        //pi2_dst[2] = x0 - x1;
    sub v17.4s, v7.4s, v6.4s        //pi2_dst[3] = x3 - x2;

    //transpose 4x4 block
    trn1 v18.4s, v14.4s, v15.4s
    trn2 v19.4s, v14.4s, v15.4s
    trn1 v20.4s, v16.4s, v17.4s
    trn2 v21.4s, v16.4s, v17.4s

    trn1 v14.2d, v18.2d, v20.2d
    trn2 v16.2d, v18.2d, v20.2d
    trn1 v15.2d, v19.2d, v21.2d
    trn2 v17.2d, v19.2d, v21.2d
    //end transpose

    add v18.4s, v14.4s, v17.4s      //x0 = x4 + x7;
    add v19.4s, v15.4s, v16.4s      //x1 = x5 + x6;
    sub v20.4s, v15.4s, v16.4s      //x2 = x5 - x6;
    sub v21.4s, v14.4s, v17.4s      //x3 = x4 - x7;

    dup v14.4s, w5                  //round factor
    dup v15.4s, v14.s[0]
    dup v16.4s, v14.s[0]
    dup v17.4s, v14.s[0]

    add v22.4s, v18.4s, v19.4s      //(x0 + x1)
    add v23.4s, v21.4s, v20.4s      //(x3 + x2)
    sub v24.4s, v18.4s, v19.4s      //(x0 - x1)
    sub v25.4s, v21.4s, v20.4s      //(x3 - x2)

    shrn v0.4h, v22.4s, #1          //i4_value = (x0 + x1) >> 1;
    shrn2 v0.8h, v23.4s, #1         //i4_value = (x3 + x2) >> 1;
    shrn v1.4h, v24.4s, #1          //i4_value = (x0 - x1) >> 1;
    shrn2 v1.8h, v25.4s, #1         //i4_value = (x3 - x2) >> 1;

    abs v2.8h, v0.8h
    abs v3.8h, v1.8h

    cmgt v4.8h, v0.8h, #0           //get the sign, rows 1 and 2
    cmgt v5.8h, v1.8h, #0

    neg w4, w4                      //-u4_qbits
    dup v22.4s, w4                  //load -u4_qbits

    umlal v14.4s, v2.4h, v30.4h
    umlal2 v15.4s, v2.8h, v30.8h
    umlal v16.4s, v3.4h, v30.4h
    umlal2 v17.4s, v3.8h, v30.8h

    ushl v14.4s, v14.4s, v22.4s
    ushl v15.4s, v15.4s, v22.4s
    ushl v16.4s, v16.4s, v22.4s
    ushl v17.4s, v17.4s, v22.4s

    uqxtn v14.4h, v14.4s
    uqxtn2 v14.8h, v15.4s
    uqxtn v16.4h, v16.4s
    uqxtn2 v16.8h, v17.4s

    neg v15.8h, v14.8h
    neg v17.8h, v16.8h

    bsl v4.16b, v14.16b, v15.16b
    bsl v5.16b, v16.16b, v17.16b

    cmeq v0.8h, v14.8h, #0
    cmeq v1.8h, v16.8h, #0

    st1 {v4.8h-v5.8h}, [x1]

    movi v20.8b, #16

    xtn v2.8b, v0.8h
    xtn v3.8b, v1.8h

    ushr v2.8b, v2.8b, #7
    ushr v3.8b, v3.8b, #7

    add v2.8b, v2.8b, v3.8b
    addp v2.8b, v2.8b, v2.8b
    addp v2.8b, v2.8b, v2.8b
    addp v2.8b, v2.8b, v2.8b
    sub v20.8b, v20.8b, v2.8b
    st1 {v20.b}[0], [x6]

    pop_v_regs
    ret


//*****************************************************************************
//*
//* function name     : ih264_hadamard_quant_2x2_uv
//* description       : this function does forward hadamard transform and
//*                     quantization for dc block of chroma for both planes
//*
//* arguments         : x0 :pointer to src buffer
//                      x1 :pointer to dst buffer
//                      x2 :pu2_scale_matrix
//                      x3 :pu2_threshold_matrix
//                      w4 :u4_qbits
//                      w5 :u4_round_factor
//                      x6 :pu1_nnz
// values returned    : none
//
// register usage     :
// stack usage        : 0 bytes
// cycles             :
// interruptibility   : interruptible
//
// known limitations
//   assumptions      :
//
// revision history   :
//     dd mm yyyy     author(s)   changes
//     20 2  2015     100633      first version
//
//*****************************************************************************
//ih264_hadamard_quant_2x2_uv_av8(word16 *pi2_src, word16 *pi2_dst,
//                                const uword16 *pu2_scale_matrix,
//                                const uword16 *pu2_threshold_matrix,
//                                uword32 u4_qbits, uword32 u4_round_factor,
//                                uword8 *pu1_nnz)
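//-----------------------------------------------------------------------------
// illustrative c-style sketch (an assumption paraphrased from the comments in
// the code below). for each chroma plane, with the four dc values a, b, c, d:
//      y0 = (a + b) + (c + d);     y1 = (a - b) + (c - d);
//      y2 = (a + b) - (c + d);     y3 = (a - b) - (c - d);
//      level = (abs(y) * pu2_scale_matrix[0] + u4_round_factor) >> u4_qbits;
//      dst   = (y > 0) ? level : -level;
// the outputs are rearranged so that the 4 coefficients of each plane are
// contiguous, and each plane's nnz byte gets (4 - zero count).
//-----------------------------------------------------------------------------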
    .global ih264_hadamard_quant_2x2_uv_av8
ih264_hadamard_quant_2x2_uv_av8:

    push_v_regs

    ld2 {v0.4h-v1.4h}, [x0]         //load src

    ld1 {v30.h}[0], [x2]            //load pu2_scale_matrix[0]
    dup v30.4h, v30.h[0]            //pu2_scale_matrix
    uxtl v30.4s, v30.4h             //pu2_scale_matrix

    neg w4, w4
    dup v24.4s, w4                  //-u4_qbits

    dup v25.4s, w5                  //round factor
    dup v26.4s, v25.s[0]

    saddl v2.4s, v0.4h, v1.4h       //x0 = x4 + x5; x2 = x6 + x7;
    ssubl v3.4s, v0.4h, v1.4h       //x1 = x4 - x5; x3 = x6 - x7;

    trn1 v4.4s, v2.4s, v3.4s
    trn2 v5.4s, v2.4s, v3.4s        //q1 -> x0 x1, q2 -> x2 x3

    add v0.4s, v4.4s, v5.4s         //(x0 + x2) (x1 + x3) (y0 + y2) (y1 + y3)
    sub v1.4s, v4.4s, v5.4s         //(x0 - x2) (x1 - x3) (y0 - y2) (y1 - y3)

    abs v2.4s, v0.4s
    abs v3.4s, v1.4s

    cmgt v4.4s, v0.4s, #0           //get the sign, rows 1 and 2
    cmgt v5.4s, v1.4s, #0

    uqxtn v4.4h, v4.4s
    sqxtn2 v4.8h, v5.4s

    mla v25.4s, v2.4s, v30.4s
    mla v26.4s, v3.4s, v30.4s

    ushl v2.4s, v25.4s, v24.4s      //>>qbits
    ushl v3.4s, v26.4s, v24.4s      //>>qbits

    uqxtn v2.4h, v2.4s
    uqxtn2 v2.8h, v3.4s

    neg v5.8h, v2.8h

    bsl v4.16b, v2.16b, v5.16b      //*sign

    //rearrange so that each plane's coeffs are contiguous
    mov v5.s[0], v4.s[1]
    mov v4.s[1], v4.s[2]
    mov v4.s[2], v5.s[0]

    cmeq v5.8h, v4.8h, #0           //compute nnz
    xtn v5.8b, v5.8h                //reduce nnz comparison to 1 bit
    ushr v5.8b, v5.8b, #7           //reduce nnz comparison to 1 bit
    movi v20.8b, #4                 //since zeros are counted, subtract from 4 to get nnz
    addp v5.8b, v5.8b, v5.8b        //sum up nnz
    addp v5.8b, v5.8b, v5.8b        //sum up nnz

    st1 {v4.8h}, [x1]               //store the block
    sub v20.8b, v20.8b, v5.8b       //4 - numzeros

    st1 {v20.h}[0], [x6]            //store nnz

    pop_v_regs
    ret