///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
///*******************************************************************************
//* //file
//*  ihevc_deblk_luma_vert.s
//*
//* //brief
//*  contains the luma deblocking filter that works on four rows of eight
//*  pixels (offsets -4 .. 3 around pu1_src). the function is coded in
//*  AArch64 (A64) assembly using Advanced SIMD instructions, GNU assembler
//*  syntax.
//*
//* //author
//*  anand s
//*
//* //par list of functions:
//*  ihevc_deblk_luma_vert_av8
//*
//* //remarks
//*  none
//*
//*******************************************************************************/

.text
.align 4



.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table

.globl ihevc_deblk_luma_vert_av8

.type ihevc_deblk_luma_vert_av8, %function

//------------------------------------------------------------------------------
// ihevc_deblk_luma_vert_av8
//
// Filters, in place, the pixels at byte offsets -4 .. 3 of four consecutive
// rows starting at pu1_src.
//
// In (names taken from the reference pseudo-code quoted in the comments below):
//   x0    pu1_src            pixel at offset 0 of the first row
//   w1    src_strd           distance in bytes between two rows
//   w2    bs
//   w3    quant_param_p
//   w4    quant_param_q
//   w5    beta_offset_div2
//   w6    tc_offset_div2
//   w7    filter_flag_p      0 : leave offsets -3 .. -1 untouched
//   [sp]  filter_flag_q      0 : leave offsets  0 ..  2 untouched
//
// NOTE(review): w1-w7 are treated as 32-bit ints and are widened to 64 bits
// here, because AAPCS64 leaves bits [63:32] of a register holding a 32-bit
// argument unspecified. Confirm against the C prototype that none of them is
// declared as a 64-bit type.
//
// Out:   nothing is returned on purpose; x0 is modified on some paths.
// Saved: d8-d15 and x19-x22 are pushed on entry and popped before every ret.
// Uses:  x2-x12, x14, v0-v7, v15-v31 as scratch.
//
// Pixel naming used in the comments: offsets -4,-3,-2,-1 are p3,p2,p1,p0 and
// offsets 0,1,2,3 are q0,q1,q2,q3.
//------------------------------------------------------------------------------
ihevc_deblk_luma_vert_av8:

    // Widen every 32-bit argument before it is used as a 64-bit register
    // (address post-increments, 64-bit adds/compares, table indices).
    sxtw        x1,w1                       // src_strd
    sxtw        x2,w2                       // bs
    sxtw        x3,w3                       // quant_param_p
    sxtw        x4,w4                       // quant_param_q
    sxtw        x5,w5                       // beta_offset_div2
    sxtw        x6,w6                       // tc_offset_div2
    stp         d8,d9,[sp,#-16]!
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!
    mov         w21,w7                      // x21 = filter_flag_p, upper half cleared (same as the ldr below)
    ldr         w22,[sp,#96]                // x22 = filter_flag_q; 96 = the six 16-byte pushes above
    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1                    // x3 = qp_luma
    add         x7,x3,x5,lsl #1
    add         x3,x3,x6,lsl #1
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt
    bgt         l1.56
    cmp         x7,#0x0
    mov         x20,#0x0
    csel        x7, x20, x7,lt              // x7 has the beta_index value
l1.56:

    asr         x2,x2,#1

    add         x3,x3,x2,lsl #1
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt
    bgt         l1.88
    cmp         x3,#0x0
    mov         x20,#0x0
    csel        x3, x20, x3,lt              // x3 has the tc_index value

// qp_luma = (quant_param_p + quant_param_q + 1) >> 1//
// beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)//
// tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)//

l1.88:
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]

    movi        v18.8b, #0x2
    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]

    ldr         w5,[x2,x7,lsl #2]           // x5 = beta (32-bit table entry)
    movi        v16.8h, #0x2
    ldr         w6,[x4,x3,lsl #2]           // x6 = tc   (32-bit table entry)
    lsl         x8,x6,#1
    cmp         x6,#0
    dup         v19.8b,w8                   // v19 = 2 * tc in every lane
    sub         x7,x0,#4
    movi        v23.8b, #0x3
    beq         l1.964                      // tc == 0 : nothing to filter


    // Load the four rows (8 bytes each, starting at offset -4) and transpose
    // them so that each column ends up in one 32-bit lane. The scalar loads
    // interleaved with the vector code compute dp0/dq0 for the first row.
    sub         x19,x0,#3
    ld1         {v15.8b},[x7],x1
    ldrb        w8,[x19]                    // -3 value
    ld1         {v1.8b},[x7],x1
    ldrb        w10,[x19,#1]                // -2 value
    ld1         {v29.8b},[x7],x1
    ldrb        w11,[x19,#2]                // -1 value
    ld1         {v0.8b},[x7]
    ldrb        w12,[x0,#0]                 //  0 value
    ldrb        w9,[x0,#1]                  //  1 value
    trn1        v24.8b,v15.8b,v1.8b
    trn2        v1.8b,v15.8b,v1.8b
    ldrb        w2,[x0,#2]                  //  2 value
    trn1        v2.8b,v29.8b,v0.8b
    trn2        v0.8b,v29.8b,v0.8b
    add         x12,x12,x2
    subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
    csneg       x9,x9,x9,pl                 // absolute value
//dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )//
    mov         v29.8b,v24.8b
    trn1        v24.4h,v29.4h,v2.4h         // v24.s[0] = p3 column, v24.s[1] = q0 column
    trn2        v2.4h,v29.4h,v2.4h          // v2.s[0]  = p1 column, v2.s[1]  = q2 column
    add         x8,x8,x11
    mov         v15.8b,v1.8b
    trn1        v1.4h,v15.4h,v0.4h          // v1.s[0]  = p2 column, v1.s[1]  = q1 column
    trn2        v0.4h,v15.4h,v0.4h          // v0.s[0]  = p0 column, v0.s[1]  = q3 column
    subs        x8,x8,x10,lsl #1
    csneg       x8,x8,x8,pl                 // dp0 value is stored in x8
// dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )//



    add         x14,x1,x1,lsl #1
    add         x14,x0,x14                  // x14 = pu1_src + 3 * src_strd (fourth row)

    // Column registers from here on:
    //   v4 = q0, v5 = q1, v7 = q2, v22 = q3, v2 = p0, v3 = p1, v6 = p2
    sub         x19,x14,#3
    dup         v4.2s, v24.s[1]
    ldrb        w2,[x19]                    // -3 value
    dup         v7.2s, v2.s[1]
    ldrb        w10,[x19,#1]                // -2 value
    dup         v3.2s, v2.s[0]
    ldrb        w11,[x19,#2]                // -1 value
    dup         v5.2s, v1.s[1]
    ldrb        w12,[x14,#0]                //  0 value
    dup         v6.2s, v1.s[0]
    ldrb        w3,[x14,#1]                 //  1 value
    dup         v2.2s, v0.s[0]
    ldrb        w4,[x14,#2]                 //  2 value


    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1           // dq3 value is stored in x12
    csneg       x12,x12,x12,pl
// dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1]+ pu1_src[3 * src_strd + 0] )//


    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl              // dp3 value is stored in x11
// dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )//



    add         x3,x8,x9                    // x3 has the d0 value
    add         x4,x11,x12                  // x4 has the d3 value


// d0 = dp0 + dq0//
// d3 = dp3 + dq3//

    add         x14,x8,x11                  // x14 has the value dp
    add         x12,x12,x9                  // x12 has the value dq
// dp = dp0 + dp3//
// dq = dq0 + dq3//

    add         x11, x3, x4                 // x11 has the value d

// d = d0 + d3//


    cmp         x11,x5
    dup         v22.2s, v0.s[1]
    bge         l1.964

// if(d < beta)


    // registers which must not be altered : x3,x4,x5,x6,x12,x14,x0,x1
    // registers free for use              : x2,x7,x8,x9,x10,x11

    // First-row test for the strong filter. The q-side strong results are
    // computed speculatively in between the scalar compares:
    //   v22 = new q2, v20 = new q0, v0 -> v21 = new q1 (each clipped to +-2*tc)
    uqsub       v30.8b,v7.8b,v19.8b
    asr         x10,x5,#2
    uqadd       v31.8b,v7.8b,v19.8b
    cmp         x10,x3,lsl #1
    uaddl       v0.8h,v5.8b,v4.8b
    ble         l1.336                      // (beta >> 2) <= 2 * d0 : normal filter

    sub         x19,x0,#4
    ldrb        w2,[x19]                    // -4 value
    uaddw       v0.8h,  v0.8h ,  v2.8b      // v0 = q1 + q0 + p0
    ldrb        w7,[x19,#3]                 // -1 value
    umull       v20.8h, v7.8b, v23.8b
    ldrb        w3,[x0,#0]                  //  0 value
    umlal       v20.8h, v22.8b, v18.8b
    ldrb        w8,[x0,#3]                  //  3 value

    add         v20.8h,  v20.8h ,  v0.8h
    subs        x8,x8,x3
    rshrn       v22.8b,v20.8h,#3
    csneg       x8,x8,x8,pl
    subs        x2,x2,x7
    umin        v21.8b,  v22.8b ,  v31.8b
    csneg       x2,x2,x2,pl
    umax        v22.8b,  v21.8b ,  v30.8b
    add         x8,x8,x2
    uaddl       v20.8h,v7.8b,v3.8b
    cmp         x8,x5,asr #3
    mla         v20.8h, v0.8h, v16.8h
    bge         l1.336
    uaddw       v0.8h,  v0.8h ,  v7.8b
    subs        x7,x3,x7
    rshrn       v20.8b,v20.8h,#3
    csneg       x7,x7,x7,pl
    rshrn       v0.8b,v0.8h,#2
    mov         x10,#5
    uqadd       v30.8b,v5.8b,v19.8b
    mul         x10, x10, x6
    uqsub       v31.8b,v5.8b,v19.8b
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    bge         l1.336


// The same three conditions are now checked on the fourth row:
// if( (2 * d3 < (beta >> 2)&& ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
//     && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )
// The p-side strong results are computed in between: v26 = new p0.


    asr         x10,x5,#2
    uqsub       v25.8b,v4.8b,v19.8b
    cmp         x10,x4,lsl #1
    uqadd       v21.8b,v4.8b,v19.8b
    ble         l1.336
    umin        v26.8b,  v20.8b ,  v21.8b
    add         x4,x1,x1,lsl #1
    add         x4,x4,x0                    // x4 = pu1_src + 3 * src_strd
    umax        v20.8b,  v26.8b ,  v25.8b
    sub         x19,x4,#4
    ldrb        w2,[x19]                    // -4 value
    umin        v19.8b,  v0.8b ,  v30.8b
    ldrb        w7,[x19,#3]                 // -1 value
    umax        v21.8b,  v19.8b ,  v31.8b
    ldrb        w3,[x4,#0]                  //  0 value
    lsl         x10,x6,#1
    ldrb        w8,[x4,#3]                  //  3 value
    uaddl       v0.8h,v2.8b,v3.8b
    dup         v19.8b,w10                  // v19 = 2 * tc again (it was used as a temporary above)
    subs        x8,x8,x3
    uaddw       v0.8h,  v0.8h ,  v4.8b      // v0 = p0 + p1 + q0
    csneg       x8,x8,x8,pl
    uqadd       v30.8b,v2.8b,v19.8b
    subs        x2,x2,x7
    uqsub       v31.8b,v2.8b,v19.8b
    csneg       x2,x2,x2,pl
    uaddl       v26.8h,v5.8b,v6.8b
    add         x8,x8,x2
    mla         v26.8h, v0.8h, v16.8h
    cmp         x8,x5,asr #3
    bge         l1.336
    rshrn       v26.8b,v26.8h,#3
    subs        x7,x3,x7
    uqadd       v27.8b,v3.8b,v19.8b
    csneg       x7,x7,x7,pl
    uqsub       v28.8b,v3.8b,v19.8b
    mov         x10,#5
    umin        v16.8b,  v26.8b ,  v30.8b
    mul         x10, x10, x6
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    umax        v26.8b,  v16.8b ,  v31.8b
    bge         l1.336
    uqadd       v30.8b,v6.8b,v19.8b

    mov         x2,#2                       // de = 2 : strong filter
    mov         x4,x21
    uqsub       v31.8b,v6.8b,v19.8b
    mov         x5,x22
    b           end_dep_deq_decision
// x2 has the value of de
// x6 has the value of tc
// x5 has the value of beta
// x14 has the value of dp
// x12 has the value of dq
// x0 has the value of source address
// x1 has the src stride

l1.336:
    mov         x2,#1                       // de = 1 : normal filter
l1.424:
    mov         x11,x5                      // x11 = beta
    mov         x4,x21                      // x4  = filter_flag_p
    mov         x5,x22                      // x5  = filter_flag_q

    // tc == 1 forces dep = deq = 0 (tc == 0 has already returned above).
    cmp         x6,#1
    mov         x20,#0
    csel        x9, x20, x9,eq
    mov         x20,#0
    csel        x10, x20, x10,eq
    beq         end_dep_deq_decision

    and         x7,x4,x5

    cmp         x7,#1
    beq         both_flags_set
    cmp         x4,#0
    beq         set_flag_dep_zero


    // Reached when (flag_p & flag_q) != 1 and flag_p != 0:
    // deq = 0, dep = (dp < ((beta + (beta >> 1)) >> 3))
    add         x8,x11,x11,asr #1
    mov         x10,#0
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    b           end_dep_deq_decision
set_flag_dep_zero:

    // filter_flag_p == 0 : dep = 0, deq = (dq < ((beta + (beta >> 1)) >> 3))
    add         x8,x11,x11,asr #1
    mov         x9,#0
    asr         x8,x8,#3
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le
    b           end_dep_deq_decision

both_flags_set:
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le
end_dep_deq_decision:

// x0  = source address
// x1  = stride
// x2  = de
// x4  = flag p
// x5  = flag q
// x6  = tc
// x9  = dep
// x10 = deq


    cmp         x2,#2
    bne         l1.968                      // de != 2 : normal filter

    cmp         x5,#0
    beq         l1.780                      // flag q clear : skip the q side

    // Strong filter, q side: new q2 goes to offset 2, new (q0,q1) pairs go
    // to offsets 0..1 of each row.
    add         x3,x0,#2
    st1         {v22.b}[0],[x3],x1

    st1         {v22.b}[1],[x3],x1

    st1         {v22.b}[2],[x3],x1

    st1         {v22.b}[3],[x3]
    add         x3,x0,x1
    mov         v29.8b,v20.8b
    trn1        v20.8b,v29.8b,v21.8b
    trn2        v21.8b,v29.8b,v21.8b

    st1         {v20.h}[0],[x0]
    st1         {v21.h}[0],[x3],x1
    st1         {v20.h}[1],[x3],x1
    st1         {v21.h}[1],[x3]


l1.780:
    cmp         x4,#0
    beq         l1.964                      // flag p clear : done

    // Strong filter, p side: new p0 goes to offset -1, new (p2,p1) pairs go
    // to offsets -3..-2 of each row.
    dup         v7.2s, v24.s[0]             // v7 = p3 column
    sub         x3,x0,#1
    uaddw       v16.8h,  v0.8h ,  v6.8b
    add         x7,x3,x1
    rshrn       v2.8b,v16.8h,#2
    st1         {v26.b}[0],[x3]
    sub         x0,x0,#3
    umin        v16.8b,  v2.8b ,  v27.8b
    st1         {v26.b}[1],[x7],x1
    umull       v2.8h, v6.8b, v23.8b
    umlal       v2.8h, v7.8b, v18.8b
    st1         {v26.b}[2],[x7],x1
    umax        v5.8b,  v16.8b ,  v28.8b
    st1         {v26.b}[3],[x7]
    add         v0.8h,  v2.8h ,  v0.8h
    rshrn       v0.8b,v0.8h,#3


    umin        v1.8b,  v0.8b ,  v30.8b
    umax        v0.8b,  v1.8b ,  v31.8b

    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b
    trn2        v5.8b,v29.8b,v5.8b
    st1         {v0.h}[0],[x0],x1
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]
l1.964:
    // Common exit: restore the callee-saved registers in reverse push order.
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret

l1.968:
    // Normal filter.

    movi        v0.8h, #0x9
    neg         x11, x6
    cmp         x4,#0                       // flag p test; consumed by "beq l1.1272" below
                                            // (nothing in between writes the flags)
    movi        v16.8h, #0x3
    movi        v24.8b, #0x1


    dup         v30.8b,w11                  // v30 = -tc
    and         x11,x6,#0xff
    dup         v31.8b,w11                  // v31 = tc

    usubl       v18.8h,v4.8b,v2.8b
    mul         v18.8h, v18.8h, v0.8h
    usubl       v0.8h,v5.8b,v3.8b



    mul         v16.8h, v0.8h, v16.8h
    sub         v16.8h,  v18.8h ,  v16.8h
    srshr       v16.8h,v16.8h,#4
// delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4//

    abs         v0.8h, v16.8h
    xtn         v0.8b,  v0.8h
    // v0 holds abs(delta), narrowed to bytes

    sqxtn       v16.8b,v16.8h
    // v16 holds delta saturated to signed bytes

    movi        v1.8b, #0xa
    dup         v21.8b,w11
    mul         v1.8b, v1.8b, v21.8b
    // v1 holds (10 * tc)

//if(abs(delta) < 10 * tc)

    smin        v18.8b,  v16.8b ,  v31.8b
    smax        v20.8b,  v18.8b ,  v30.8b

// delta = clip3(delta, -tc, tc)//
    sxtl        v16.8h,  v20.8b
    uxtl        v18.8h,  v2.8b
    add         v18.8h,  v18.8h ,  v16.8h

    sqxtun      v22.8b, v18.8h
    uxtl        v18.8h,  v4.8b
    sub         v16.8h,  v18.8h ,  v16.8h
    sqxtun      v23.8b, v16.8h
// tmp_p0 = clip_u8(pu1_src[-1] + delta)//
// tmp_q0 = clip_u8(pu1_src[0] - delta)//
    beq         l1.1272                     // flag p clear : skip the p side



    cmp         x9,#1
    bne         l1.1212
// dep is set : also compute a new p1

    asr         x3,x6,#1


    uaddl       v16.8h,v6.8b,v2.8b
    uaddw       v16.8h,  v16.8h ,  v24.8b
    dup         v18.8b,w3                   // v18 =  (tc >> 1)
    sub         x20,x3,#0
    neg         x3, x20
    dup         v19.8b,w3                   // v19 = -(tc >> 1)
    ushr        v16.8h,v16.8h,#1
    xtn         v16.8b,  v16.8h

    usubl       v16.8h,v16.8b,v3.8b
    saddw       v16.8h,  v16.8h ,  v20.8b
    sshr        v16.8h,v16.8h,#1
    sqxtn       v16.8b,v16.8h

    smin        v17.8b,  v16.8b ,  v18.8b
    smax        v16.8b,  v19.8b ,  v17.8b




    uxtl        v18.8h,  v3.8b
    sxtl        v16.8h,  v16.8b
    add         v16.8h,  v18.8h ,  v16.8h

    sqxtun      v16.8b, v16.8h
    mov         v30.8b,v3.8b
    cmhs        v3.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


    bsl         v3.8b,v30.8b,v16.8b         // keep the original p1 where the mask is set
l1.1212:
    dup         v16.8b,w11
    sub         x12,x0,#3
    sub         x3,x0,#1
    mov         v29.8b,v6.8b
    trn1        v6.8b,v29.8b,v3.8b
    trn2        v3.8b,v29.8b,v3.8b
    st1         {v6.h}[0],[x12],x1          // (p2,p1) pairs -> offsets -3..-2
    cmhs        v16.8b,v0.8b,v1.8b
    st1         {v3.h}[0],[x12],x1
    bsl         v16.8b,v2.8b,v22.8b         // original p0 where abs(delta) >= 10 * tc, else tmp_p0
    st1         {v16.b}[0],[x3],x1          // p0 -> offset -1
    st1         {v16.b}[1],[x3],x1
    st1         {v6.h}[1],[x12],x1
    st1         {v16.b}[2],[x3],x1
    st1         {v3.h}[1],[x12]
    st1         {v16.b}[3],[x3]
l1.1272:
    cmp         x5,#0
    beq         l1.964
    // flag q is set
    cmp         x10,#1
    bne         l1.1412
    // deq is set : also compute a new q1
    mov         v2.8b,v7.8b
    asr         x3,x6,#1

    dup         v6.8b,w3                    // v6  =  (tc >> 1)
    sub         x20,x3,#0
    neg         x3, x20
    dup         v16.8b,w3                   // v16 = -(tc >> 1)
    uaddl       v2.8h,v2.8b,v4.8b
    uaddw       v2.8h,  v2.8h ,  v24.8b
    ushr        v2.8h,v2.8h,#1
    xtn         v2.8b,  v2.8h

    usubl       v2.8h,v2.8b,v5.8b
    ssubw       v2.8h,  v2.8h ,  v20.8b
    sshr        v2.8h,v2.8h,#1
    sqxtn       v3.8b,v2.8h

    smin        v2.8b,  v3.8b ,  v6.8b
    smax        v3.8b,  v16.8b ,  v2.8b



    uxtl        v16.8h,  v5.8b
    sxtl        v2.8h,  v3.8b
    add         v2.8h,  v16.8h ,  v2.8h
    sqxtun      v3.8b, v2.8h
    mov         v30.8b,v5.8b
    cmhs        v5.8b,v0.8b,v1.8b           // mask: abs(delta) >= 10 * tc


    bsl         v5.8b,v30.8b,v3.8b          // keep the original q1 where the mask is set
l1.1412:
    add         x3,x0,#2
    add         x11,x3,x1
    st1         {v7.b}[0],[x3]              // q2 is stored back unchanged at offset 2
    st1         {v7.b}[1],[x11],x1
    st1         {v7.b}[2],[x11],x1
    cmhs        v0.8b,v0.8b,v1.8b
    st1         {v7.b}[3],[x11]
    bsl         v0.8b,v4.8b,v23.8b          // original q0 where abs(delta) >= 10 * tc, else tmp_q0
    mov         v29.8b,v0.8b
    trn1        v0.8b,v29.8b,v5.8b
    trn2        v5.8b,v29.8b,v5.8b
    st1         {v0.h}[0],[x0],x1           // (q0,q1) pairs -> offsets 0..1
    st1         {v5.h}[0],[x0],x1
    st1         {v0.h}[1],[x0],x1
    st1         {v5.h}[1],[x0]

    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16
    ret

