///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///*******************************************************************************
//* @file
//*  ihevc_deblk_luma_horz.s
//*  NOTE(review): this header previously named ihevc_deblk_luma_vert.s, but the
//*  only symbol defined here is ihevc_deblk_luma_horz_av8 - confirm file name.
//*
//* @brief
//*  contains the function definition for the luma deblocking edge filter
//*  ihevc_deblk_luma_horz_av8. the function is hand written in aarch64 neon
//*  assembly (gnu assembler syntax).
//*
//* @author
//*  anand s
//*
//* @par list of functions:
//*  - ihevc_deblk_luma_horz_av8
//*
//* @remarks
//*  none
//*
//*******************************************************************************/

.text
.align 4


.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_av8

.type ihevc_deblk_luma_horz_av8, %function

//------------------------------------------------------------------------------
// ihevc_deblk_luma_horz_av8
//
// Filters a 4 pixel wide luma edge segment: rows -4..3 (in units of the
// stride) around the source pointer are read, and up to three rows on each
// side of the edge are written back.
//
// Arguments as used by the code below:
//   x0      source address (points at row 0, column 0 of the q side)
//   w1      source stride
//   w2      bs
//   w3      quant_param_p
//   w4      quant_param_q
//   w5      beta_offset_div2
//   w6      tc_offset_div2
//   w7      filter flag p
//   [sp]    filter flag q (first stack argument)
//
// All integer arguments are handled as signed 32 bit values and are sign
// extended on entry, because the upper halves of the x registers are not
// guaranteed by the procedure call standard.
//
// Saved/restored: d8-d15, x19-x22 (96 bytes of stack).
// Scratch:        x2-x12, x14, x19, x20, v0-v7, v9-v31, flags.
//------------------------------------------------------------------------------
ihevc_deblk_luma_horz_av8:
    // stmfd sp!, {x3-x12,x14}
    sxtw        x1,w1                       // src stride is used in 64 bit address arithmetic
    sxtw        x2,w2                       // bs
    sxtw        x3,w3                       // quant_param_p
    sxtw        x4,w4                       // quant_param_q
    sxtw        x5,w5                       // beta_offset_div2
    sxtw        x6,w6                       // tc_offset_div2
    stp         d8,d9,[sp,#-16]!            // Storing d9 using { sub sp,sp,#8; str d9,[sp] } is giving bus error.
                                            // d8 is used as dummy register and stored along with d9 using stp. d8 is not used in the function.
    stp         d10,d11,[sp,#-16]!
    stp         d12,d13,[sp,#-16]!
    stp         d14,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!
    stp         x21, x22,[sp,#-16]!

    mov         x21,x7                      // x21 = filter flag p
    ldr         w22,[sp,#96]                // w22 = filter flag q; 6 pushes of 16 bytes precede the stack argument

    add         x3,x3,x4
    add         x3,x3,#1
    asr         x3,x3,#1                    // x3 = qp_luma
    add         x7,x3,x5,lsl #1
    add         x3,x3,x6,lsl #1
    cmp         x7,#0x33
    mov         x20,#0x33
    csel        x7, x20, x7,gt
    bgt         l1.1532
    cmp         x7,#0x0
    mov         x20,#0x0
    csel        x7, x20, x7,lt              // x7 has the beta_index value
l1.1532:
    //     bic      x2,x2,#1
    asr         x2,x2,#1

    add         x3,x3,x2,lsl #1
    cmp         x3,#0x35
    mov         x20,#0x35
    csel        x3, x20, x3,gt
    bgt         l1.1564
    cmp         x3,#0x0
    mov         x20,#0x0
    csel        x3, x20, x3,lt              // x3 has the tc_index value

    //    qp_luma = (quant_param_p + quant_param_q + 1) >> 1
    //    beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
    //    tc_indx = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)

l1.1564:
    adrp        x2, :got:gai4_ihevc_beta_table
    ldr         x2, [x2, #:got_lo12:gai4_ihevc_beta_table]

    adrp        x4, :got:gai4_ihevc_tc_table
    ldr         x4, [x4, #:got_lo12:gai4_ihevc_tc_table]

    ldr         w5, [x2,x7,lsl #2]          // beta
    ldr         w6, [x4,x3,lsl #2]          // tc



    cmp         x6,#0
    beq         l1.2404                     // tc == 0 : nothing to filter
    movi        v0.4h, #0x2
    lsl         x7,x6,#1
    add         x14,x1,x1,lsl #1            // x14 = 3 * stride
    neg         x19,x14
    ldr         w8, [x0,x19]                // -3 value
    dup         v1.8b,w7                    // v1 = 2 * tc in every byte
    lsl         x19,x1,#1
    neg         x19,x19
    ldr         w10, [x0,x19]               //-2 value
    dup         v23.2s,w8                   // -3 value
    neg         x19,x1
    ldr         w11, [x0,x19]               //-1 value
    dup         v24.2s,w10                  // -2 value
    and         x8,x8,#0xff
    ldr         w12, [x0,#0]                // 0 value
    dup         v25.2s,w11                  // -1 value
    and         x10,x10,#0xff
    ldr         w9, [x0,x1]                 // 1 value
    dup         v26.2s,w12                  // 0 value
    and         x11,x11,#0xff
    lsl         x19,x1,#1
    ldr         w2, [x0,x19]                // 2 value
    dup         v27.2s,w9                   // 1 value
    and         x12,x12,#0xff
    dup         v28.2s,w2                   // 2 value
    and         x9,x9,#0xff
    and         x2,x2,#0xff

    // The masked scalars above are column 0 of rows -3..2.

    add         x12,x12,x2
    subs        x9,x12,x9,lsl #1            // dq0 value is stored in x9
    csneg       x9,x9,x9,pl
    // dq0 = abs( row[2] - 2 * row[1] + row[0] )  (column 0)

    add         x8,x8,x11
    subs        x8,x8,x10,lsl #1
    csneg       x8,x8,x8,pl                 // dp0 value is stored in x8
    // dp0 = abs( row[-3] - 2 * row[-2] + row[-1] )  (column 0)



    add         x3,x1,x1,lsl #1             // x3 = 3 * stride
    add         x14,x0,#3                   // x14 = address of column 3 in row 0


    neg         x19,x3
    ldrb        w2,[x14,x19]                // -3 value
    lsl         x19,x1,#1
    neg         x19,x19
    ldrb        w10,[x14,x19]               // -2 value
    neg         x19,x1
    ldrb        w11,[x14,x19]               // -1 value
    ldrb        w12,[x14,#0]                // 0 value
    ldrb        w3,[x14,x1]                 // 1 value
    lsl         x19,x1,#1
    ldrb        w4,[x14,x19]                // 2 value


    add         x12,x12,x4
    subs        x12,x12,x3,lsl #1           // dq3 value is stored in x12
    csneg       x12,x12,x12,pl
    // dq3 = abs( row[2] - 2 * row[1] + row[0] )  (column 3)


    add         x2,x2,x11
    subs        x11,x2,x10,lsl #1
    csneg       x11,x11,x11,pl              // dp3 value is stored in x11
    // dp3 = abs( row[-3] - 2 * row[-2] + row[-1] )  (column 3)



    add         x3,x8,x9                    // x3 has the d0 value
    add         x4,x11,x12                  // x4 has the d3 value


    //    d0 = dp0 + dq0
    //    d3 = dp3 + dq3

    add         x14,x8,x11                  // x14 has the value dp
    add         x12,x12,x9                  // x12 has the value dq
    //    dp = dp0 + dp3
    //    dq = dq0 + dq3

    add         x11, x3, x4                 // x11 has the value d

    //    d = d0 + d3


    cmp         x11,x5
    bge         l1.2404

    //    if(d < beta)


    // registers which cannot be altered : x3,x4 x5,x6,x12,x14,x0,x1,x11

    // registers for use: x2,x7,x8,x9,x10,

    // Strong filter decision. The scalar tests are interleaved with the
    // NEON computation of the strong filter outputs; any failing test
    // branches to l1.1840 (normal filter).

    asr         x10,x5,#2
    uqadd       v30.8b,  v26.8b ,  v1.8b
    cmp         x10,x3,lsl #1               // (beta >> 2) <= 2 * d0 ?
    uqsub       v31.8b,  v26.8b ,  v1.8b
    ble         l1.1840
    add         x10,x1,x1,lsl #1
    uaddl       v6.8h,  v25.8b ,  v26.8b
    neg         x19,x1
    ldr         w2, [x0,x19,lsl #2]         // has the -4 value
    neg         x19, x1
    ldrb        w7,[x0,x19]                 // has the -1 value
    dup         v22.2s,w2                   // -4 value
    uaddw       v7.8h,  v6.8h ,  v27.8b
    ldrb        w3,[x0,#0]                  // x3 has the 0 value
    uqadd       v16.8b,  v27.8b ,  v1.8b
    and         x2,x2,#0xff
    mul         v12.8h, v7.8h, v0.4h[0]
    ldr         w8, [x0,x10]                // has the 3 value
    uaddl       v10.8h,  v24.8b ,  v28.8b
    subs        x2,x2,x7
    uqsub       v17.8b,  v27.8b ,  v1.8b
    dup         v29.2s,w8                   // 3 value
    and         x8,x8,#0xff
    add         v12.8h,  v12.8h ,  v10.8h
    csneg       x2,x2,x2,pl
    rshrn       v20.8b, v12.8h,#3
    subs        x8,x8,x3
    csneg       x8,x8,x8,pl
    umin        v18.8b,  v20.8b ,  v30.8b
    add         x8,x8,x2

    cmp         x8,x5,asr #3
    bge         l1.1840
    uaddw       v14.8h,  v7.8h ,  v28.8b
    subs        x7,x3,x7
    umax        v4.8b,  v18.8b ,  v31.8b    // v4 = strong filtered row 0
    csneg       x7,x7,x7,pl
    uqadd       v30.8b,  v28.8b ,  v1.8b
    mov         x10,#5
    rshrn       v21.8b, v14.8h,#2
    mul         x10, x10, x6
    uqsub       v31.8b,  v28.8b ,  v1.8b
    add         x10, x10,#1
    cmp         x7,x10,asr #1
    umin        v18.8b,  v21.8b ,  v16.8b
    bge         l1.1840


    //    column 0 test:
    //    if( (2 * d0 < (beta >> 2)) && ( abs(row[3] - row[0]) + abs(row[-1] - row[-4]) < (beta >> 3) )
    //            && abs(row[0] - row[-1]) < ( (5 * tc + 1) >> 1 ) )

    umax        v5.8b,  v18.8b ,  v17.8b    // v5 = strong filtered row 1
    asr         x10,x5,#2
    uaddl       v16.8h,  v29.8b ,  v28.8b
    cmp         x10,x4,lsl #1               // (beta >> 2) <= 2 * d3 ?
    ble         l1.1840

    add         x10,x1,x1,lsl #1
    mul         v16.8h, v16.8h, v0.4h[0]
    add         x4,x0,#3                    // x4 = address of column 3 in row 0


    lsl         x19,x1,#2
    neg         x19,x19
    ldrb        w2,[x4,x19]                 // has the -4 value
    add         v16.8h,  v16.8h ,  v14.8h
    neg         x19,x1
    ldrb        w7,[x4,x19]                 // has the -1 value
    rshrn       v19.8b, v16.8h,#3
    ldrb        w3,[x4,#0]                  // has the 0 value
    ldrb        w8,[x4,x10]                 // has the 3 value
    //   ubfx   x7,x2,#24,#8           @ has the -1 value
    //  and    x2,#0xff               @ has the -4 value
    //  ubfx   x8,x3,#24,#8           @ has the 3 value
    //  and    x3,#0xff               @ x4 has the 0 value



    subs        x8,x8,x3
    umin        v18.8b,  v19.8b ,  v30.8b
    csneg       x8,x8,x8,pl
    uaddl       v6.8h,  v25.8b ,  v24.8b
    subs        x2,x2,x7
    umax        v3.8b,  v18.8b ,  v31.8b    // v3 = strong filtered row 2
    csneg       x2,x2,x2,pl
    uaddw       v7.8h,  v6.8h ,  v26.8b
    add         x8,x8,x2
    uqadd       v30.8b,  v25.8b ,  v1.8b
    cmp         x8,x5,asr #3
    uqsub       v31.8b,  v25.8b ,  v1.8b
    bge         l1.1840
    mul         v12.8h, v7.8h, v0.4h[0]
    subs        x7,x3,x7
    uqadd       v16.8b,  v24.8b ,  v1.8b
    csneg       x7,x7,x7,pl
    uaddl       v10.8h,  v23.8b ,  v27.8b
    mov         x10,#5
    uqsub       v17.8b,  v24.8b ,  v1.8b
    mul         x10, x10, x6
    add         v12.8h,  v12.8h ,  v10.8h
    add         x10, x10,#1
    rshrn       v20.8b, v12.8h,#3
    cmp         x7,x10,asr #1
    uaddw       v14.8h,  v7.8h ,  v23.8b
    bge         l1.1840
    umin        v18.8b,  v20.8b ,  v30.8b
    mov         x2,#2                       // de = 2 : strong filter selected
    uqadd       v30.8b,  v23.8b ,  v1.8b
    mov         w4,w21                      // x4 = filter flag p
    umax        v2.8b,  v18.8b ,  v31.8b    // v2 = strong filtered row -1
    mov         w5,w22                      // x5 = filter flag q
    rshrn       v21.8b, v14.8h,#2
    b           end_dep_deq_decision_horz
    // x2 has the value of de
    // x6 has the value of tc
    // x5 has the value of beta
    // x14 has the value of dp
    // x12 has the value of dq
    // x0 has the value of source address
    // x1 has the src stride

l1.1840:
    // Normal filter selected: derive dep (x9) and deq (x10).
    mov         x2,#1                       // de = 1

    mov         x11,x5                      // x11 = beta (x5 is reused for flag q)
    mov         w4,w21                      // x4 = filter flag p
    mov         w5,w22                      // x5 = filter flag q

    cmp         x6,#1
    mov         x20,#0
    csel        x9, x20, x9,eq
    mov         x20,#0
    csel        x10, x20, x10,eq
    beq         end_dep_deq_decision_horz   // tc == 1 : dep = deq = 0

    and         x7,x4,x5
    cmp         x7,#1
    beq         both_flags_set_horz
    cmp         x4,#0
    beq         set_flag_dep_zero_horz


    // only flag p is set: deq = 0, dep = ((beta + (beta >> 1)) >> 3) > dp
    add         x8,x11,x11,asr #1
    mov         x10,#0
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    b           end_dep_deq_decision_horz
set_flag_dep_zero_horz:

    // flag p is clear: dep = 0, deq = ((beta + (beta >> 1)) >> 3) > dq
    add         x8,x11,x11,asr #1
    mov         x9,#0
    asr         x8,x8,#3
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le
    b           end_dep_deq_decision_horz

both_flags_set_horz:
    add         x8,x11,x11,asr #1
    asr         x8,x8,#3
    cmp         x8,x14
    mov         x20,#1
    csel        x9, x20, x9,gt
    mov         x20,#0
    csel        x9, x20, x9,le
    cmp         x8,x12
    mov         x20,#1
    csel        x10, x20, x10,gt
    mov         x20,#0
    csel        x10, x20, x10,le
end_dep_deq_decision_horz:

    //x0=source address
    //x1=stride
    // x2 =de
    // x4=flag p
    //x5= flag q
    //x6 =tc
    // x9 =dep
    // x10=deq



    //    add       x14,x1,x1,lsl #1
    //    lsl       x7,x6,#1
    //    vdup.8    d1,x7
    //    vmov.i16  d0,#0x2
    umin        v18.8b,  v21.8b ,  v16.8b
    cmp         x2,#1
    uqsub       v31.8b,  v23.8b ,  v1.8b
    beq         l1.2408                     // de == 1 : normal filter
    uaddl       v7.8h,  v23.8b ,  v22.8b
    cmp         x5,#1

    bne         strong_filtering_check_p_horz   // flag q clear: skip the q side stores

strong_filtering_q:
    mov         x12,x0
    st1         {v4.s}[0],[x12],x1          // row 0
    st1         {v5.s}[0],[x12],x1          // row 1
    st1         {v3.s}[0],[x12]             // row 2
strong_filtering_check_p_horz:
    // The p side is written only when flag p is set, matching the flag
    // handling of the normal filter path below.
    cmp         x4,#1
    bne         l1.2404
strong_filtering_p:
    umax        v5.8b,  v18.8b ,  v17.8b
    mov         x12,x0
    mul         v7.8h, v7.8h, v0.4h[0]
    sub         x20,x1,#0
    neg         x11, x20                    // x11 = -stride
    add         v16.8h,  v7.8h ,  v14.8h
    add         x12,x12,x11
    rshrn       v19.8b, v16.8h,#3
    st1         {v2.s}[0],[x12],x11         // row -1
    umin        v18.8b,  v19.8b ,  v30.8b
    st1         {v5.s}[0],[x12],x11         // row -2
    umax        v3.8b,  v18.8b ,  v31.8b
    st1         {v3.s}[0],[x12]             // row -3

l1.2404:
    // ldmfd sp!, {x3-x12,pc}
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16              // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
    ret

    // x4=flag p
    //x5= flag q
    //x6 =tc
    // x9 =dep
    // x10=deq


    // v22   -4 value

    // v23   -3 value

    // v24   -2 value

    // v25   -1 value

    // v26    0 value

    // v27    1 value

    // v28    2 value

    // v29    3 value

l1.2408:
    // Normal filter.

    movi        v0.4h, #0x9

    usubl       v10.8h,  v26.8b ,  v25.8b

    mul         v10.8h, v10.8h, v0.4h[0]

    movi        v0.4h, #0x3

    usubl       v12.8h,  v27.8b ,  v24.8b
    mul         v12.8h, v12.8h, v0.4h[0]


    dup         v30.8b,w6                   // duplicating the +tc value

    sub         x20,x6,#0
    neg         x12, x20
    dup         v31.8b,w12                  // duplicating the -tc value



    sub         v10.8h,  v10.8h ,  v12.8h



    srshr       v10.8h, v10.8h,#4
    //   delta = ( 9 * (row[0] - row[-1]) - 3 * (row[1] - row[-2]) + 8 ) >> 4

    abs         v7.8h, v10.8h
    xtn         v9.8b,  v7.8h
    // storing the absolute values of delta in v9

    sqxtn       v10.8b,  v10.8h
    // storing the saturated 8 bit values of delta in v10


    smin        v11.8b,  v10.8b ,  v30.8b
    smax        v7.8b,  v31.8b ,  v11.8b    // v7 has the value delta = clip3(delta, -tc, tc)


    uxtl        v6.8h, v25.8b

    saddw       v4.8h,  v6.8h ,  v7.8b

    sqxtun      v12.8b, v4.8h               // v12 = temp_p0 = clip_u8(row[-1] + delta)
    uxtl        v6.8h, v26.8b
    ssubw       v4.8h,  v6.8h ,  v7.8b
    sqxtun      v13.8b, v4.8h               // v13 = temp_q0 = clip_u8(row[0] - delta)


    mov         x11,#0xa
    mul         x12, x11, x6
    dup         v2.8b,w12                   // v2 has the 10*tc value
    mov         v18.8b, v24.8b              // default for row -2: unfiltered
    dup         v0.8b,w6
    sshr        v0.8b,v0.8b,#1              // v0 = tc >> 1
    neg         v1.8b, v0.8b                // v1 = -(tc >> 1)

    cmp         x4,#1
    bne         l1.2724                     // flag p clear: skip the p side
    cmp         x9,#1
    bne         l1.2700                     // dep clear: only row -1 is modified

    // v12 and v13 have the value temp_p0 and temp_q0
    uaddl       v14.8h,  v23.8b ,  v25.8b
    rshrn       v14.8b, v14.8h,#1
    usubl       v14.8h,  v14.8b ,  v24.8b
    saddw       v14.8h,  v14.8h ,  v7.8b
    sqshrn      v14.8b, v14.8h,#1
    smin        v15.8b,  v14.8b ,  v0.8b
    smax        v14.8b,  v1.8b ,  v15.8b

    // v14 has the delta p value
    uxtl        v16.8h, v24.8b
    saddw       v16.8h,  v16.8h ,  v14.8b
    sqxtun      v14.8b, v16.8h

    //  v14 = tmp_p1 = clip_u8(row[-2] + delta_p)
    cmhs        v18.8b,v9.8b,v2.8b          // mask = abs(delta) >= 10 * tc
    bsl         v18.8b,v24.8b,v14.8b        // keep the unfiltered row where the mask is set

l1.2700:
    mov         x12,x0
    sub         x20,x1,#0
    neg         x11, x20                    // x11 = -stride
    add         x12,x12,x11
    cmhs        v19.8b,v9.8b,v2.8b
    bsl         v19.8b,v25.8b,v12.8b
    st1         {v19.s}[0],[x12],x11        // row -1
    st1         {v18.s}[0],[x12]            // row -2
l1.2724:
    cmp         x5,#1
    bne         l1.2404                     // flag q clear: done
    cmp         x10,#1
    mov         v18.8b, v27.8b              // default for row 1: unfiltered
    bne         l1.2852                     // deq clear: only row 0 is modified

    uaddl       v14.8h,  v26.8b ,  v28.8b
    rshrn       v14.8b, v14.8h,#1
    usubl       v14.8h,  v14.8b ,  v27.8b
    ssubw       v14.8h,  v14.8h ,  v7.8b
    sqshrn      v14.8b, v14.8h,#1
    smin        v15.8b,  v14.8b ,  v0.8b
    smax        v14.8b,  v1.8b ,  v15.8b
    // v14 has the delta q value
    uxtl        v16.8h, v27.8b
    saddw       v16.8h,  v16.8h ,  v14.8b
    sqxtun      v14.8b, v16.8h
    cmhs        v18.8b,v9.8b,v2.8b
    bsl         v18.8b,v27.8b,v14.8b
l1.2852:
    mov         x12,x0
    cmhs        v19.8b,v9.8b,v2.8b
    bsl         v19.8b,v26.8b,v13.8b
    st1         {v19.s}[0],[x12],x1         // row 0
    st1         {v18.s}[0],[x12]            // row 1
    // ldmfd sp!, {x3-x12,x15}
    ldp         x21, x22,[sp],#16
    ldp         x19, x20,[sp],#16
    ldp         d14,d15,[sp],#16
    ldp         d12,d13,[sp],#16
    ldp         d10,d11,[sp],#16
    ldp         d8,d9,[sp],#16              // Loading d9 using { ldr d9,[sp]; add sp,sp,#8 } is giving bus error.
                                            // d8 is used as dummy register and loaded along with d9 using ldp. d8 is not used in the function.
    ret

