@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@/*******************************************************************************
@* @file
@*  ihevc_deblk_luma_vert.s
@*
@* @brief
@*  contains the luma deblocking routine ihevc_deblk_luma_vert_a9q, which
@*  filters four rows of eight samples (offsets -4 .. 3 around pu1_src).
@*  the routine is hand written arm / neon assembly in gnu assembler syntax
@*  (the original header mentioned rvct - NOTE(review): confirm the toolchain)
@*
@* @author
@*  anand s
@*
@* @par list of functions:
@*  - ihevc_deblk_luma_vert_a9q
@*
@* @remarks
@*  none
@*
@*******************************************************************************/

@ Offsets of the stack-passed arguments relative to sp AFTER the prologue
@ "push {r3-r12,lr}" (11 registers = 44 bytes), i.e. the first stack argument
@ of the caller ends up at sp + 44.
.equ qp_q_offset,                   44
.equ beta_offset_div2_offset,       48
.equ tc_offset_div2_offset,         52
.equ filter_p_offset,               56
.equ filter_q_offset,               60

.text
.align 4

@ Lookup tables defined outside this file; indexed below as 32-bit entries.
.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table

.globl ihevc_deblk_luma_vert_a9q

@ Position independent table addressing: each literal holds
@ (table - ulblN - 8). Adding pc at label ulblN then yields the table address,
@ given that pc reads as (ulblN + 8) there (ARM-state read-ahead; this file
@ carries no explicit .arm/.thumb directive - NOTE(review): confirm it is
@ always assembled for ARM state).
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl1 - 8

gai4_ihevc_beta_table_addr:
.long gai4_ihevc_beta_table - ulbl2 - 8

.type ihevc_deblk_luma_vert_a9q, %function

@/**
@*******************************************************************************
@* ihevc_deblk_luma_vert_a9q
@*
@* Arguments, as consumed by the code below:
@*   r0                              : pu1_src  (address of the sample at offset 0, row 0)
@*   r1                              : src_strd (byte step between rows)
@*   r2                              : bs
@*   r3                              : quant_param_p
@*   [sp, #qp_q_offset]              : quant_param_q
@*   [sp, #beta_offset_div2_offset]  : beta_offset_div2
@*   [sp, #tc_offset_div2_offset]    : tc_offset_div2
@*   [sp, #filter_p_offset]          : filter_flag_p
@*   [sp, #filter_q_offset]          : filter_flag_q
@*   (stack offsets are valid after the prologue push)
@*
@* Notation used in the comments: for each of the 4 rows,
@*   p3 p2 p1 p0 = pu1_src[-4] .. pu1_src[-1]
@*   q0 q1 q2 q3 = pu1_src[0]  .. pu1_src[3]
@*
@* Flow:
@*   1. derive beta and tc from the tables; return if tc == 0
@*   2. compute dp0/dq0 (row 0) and dp3/dq3 (row 3); return if d >= beta
@*   3. decide de = 2 (all row 0 and row 3 checks pass) or de = 1, and for
@*      de = 1 also dep / deq
@*   4. de = 2 : write p2..p0 and/or q0..q2 from the 3-sample-per-side filter
@*      de = 1 : write p0/q0 (and p1/q1 when dep/deq) from the delta filter
@*      each side is written only when its filter flag is non-zero
@*
@* Registers: r3-r12 and lr are saved/restored by the push/pop pair.
@* NEON registers written: d0-d7 and d16-d31 (d8-d15 are not touched).
@*******************************************************************************
@*/
ihevc_deblk_luma_vert_a9q:

    push        {r3-r12,lr}
    ldr         r4,[sp,#qp_q_offset]        @ r4 = quant_param_q
    ldr         r5,[sp,#beta_offset_div2_offset] @ r5 = beta_offset_div2

    add         r3,r3,r4
    add         r3,r3,#1
    ldr         r6, [sp,#tc_offset_div2_offset] @ r6 = tc_offset_div2
    asr         r3,r3,#1                    @ r3 = qp_luma
    add         r7,r3,r5,lsl #1             @ r7 = qp_luma + (beta_offset_div2 << 1)
    add         r3,r3,r6,lsl #1             @ r3 = qp_luma + (tc_offset_div2 << 1)
    cmp         r7,#0x33
    movgt       r7,#0x33
    bgt         l1.56
    cmp         r7,#0x0
    movlt       r7,#0x0                     @ r7 = beta index, clipped to [0, 51]
l1.56:

@   bic      r2,r2,#1
    asr         r2,r2,#1                    @ r2 = bs >> 1

    add         r3,r3,r2,lsl #1             @ r3 += 2 * (bs >> 1)
    cmp         r3,#0x35
    movgt       r3,#0x35
    bgt         l1.88
    cmp         r3,#0x0
    movlt       r3,#0x0                     @ r3 = tc index, clipped to [0, 53]

@ qp_luma   = (quant_param_p + quant_param_q + 1) >> 1
@ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
@ tc_indx   = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)

l1.88:
    ldr         r2,gai4_ihevc_beta_table_addr
ulbl2:
    add         r2,r2,pc                    @ r2 = address of gai4_ihevc_beta_table
    vmov.i8     d18,#0x2                    @ d18 = 2 in every byte (multiplier)
    ldr         r4,gai4_ihevc_tc_table_addr
ulbl1:
    add         r4,r4,pc                    @ r4 = address of gai4_ihevc_tc_table

    ldr         r5,[r2,r7,lsl #2]           @ r5 = beta
    vmov.i16    q8,#0x2                     @ q8 = 2 in every halfword (multiplier)
    ldr         r6,[r4,r3,lsl #2]           @ r6 = tc
    lsl         r8,r6,#1
    cmp         r6,#0                       @ flags consumed by the beq below
    vdup.8      d19,r8                      @ d19 = 2 * tc in every byte
    sub         r7,r0,#4                    @ r7 -> p3 of row 0
    vmov.i8     d23,#0x3                    @ d23 = 3 in every byte (multiplier)
    beq         l1.964                      @ tc == 0 : nothing to filter

@ Load 8 samples (p3..q3) of each of the 4 rows, interleaved with the scalar
@ loads of row 0 used for dp0 / dq0.
    vld1.8      {d24},[r7],r1               @ d24 = row 0, offsets -4 .. 3
    ldrb        r8,[r0,#-3]                 @ row 0, offset -3
    vld1.8      {d1},[r7],r1                @ d1  = row 1
    ldrb        r10,[r0,#-2]                @ row 0, offset -2
    vld1.8      {d2},[r7],r1                @ d2  = row 2
    ldrb        r11,[r0,#-1]                @ row 0, offset -1
    vld1.8      {d0},[r7]                   @ d0  = row 3
    ldrb        r12,[r0,#0]                 @ row 0, offset 0
    ldrb        r9,[r0,#1]                  @ row 0, offset 1
    vtrn.8      d24,d1
    ldrb        r2,[r0,#2]                  @ row 0, offset 2
    vtrn.8      d2,d0
    add         r12,r12,r2
    subs        r9,r12,r9,lsl #1
    rsbmi       r9,r9,#0                    @ r9 = dq0
@ dq0 = abs( pu1_src[2] - 2 * pu1_src[1] + pu1_src[0] )
    vtrn.16     d24,d2
    add         r8,r8,r11
    vtrn.16     d1,d0
@ After the four vtrn the 32-bit halves hold one column (4 rows) each:
@   d24 = {p3, q0}   d1 = {p2, q1}   d2 = {p1, q2}   d0 = {p0, q3}
    subs        r8,r8,r10,lsl #1
    rsbmi       r8,r8,#0                    @ r8 = dp0
@ dp0 = abs( pu1_src[-3] - 2 * pu1_src[-2] + pu1_src[-1] )

    add         r14,r1,r1,lsl #1
    add         r14,r0,r14                  @ r14 = pu1_src + 3 * src_strd (row 3)

@ Spread each column into its own d register (duplicated in both halves),
@ interleaved with the scalar loads of row 3 used for dp3 / dq3.
    vdup.32     d4,d24[1]                   @ d4 = q0
    ldrb        r2,[r14,#-3]                @ row 3, offset -3
    vdup.32     d7,d2[1]                    @ d7 = q2
    ldrb        r10,[r14,#-2]               @ row 3, offset -2
    vdup.32     d3,d2[0]                    @ d3 = p1
    ldrb        r11,[r14,#-1]               @ row 3, offset -1
    vdup.32     d5,d1[1]                    @ d5 = q1
    ldrb        r12,[r14,#0]                @ row 3, offset 0
    vdup.32     d6,d1[0]                    @ d6 = p2
    ldrb        r3,[r14,#1]                 @ row 3, offset 1
    vdup.32     d2,d0[0]                    @ d2 = p0
    ldrb        r4,[r14,#2]                 @ row 3, offset 2

    add         r12,r12,r4
    subs        r12,r12,r3,lsl #1
    rsbmi       r12,r12,#0                  @ r12 = dq3
@ dq3 = abs( pu1_src[3 * src_strd + 2] - 2 * pu1_src[3 * src_strd + 1] + pu1_src[3 * src_strd + 0] )

    add         r2,r2,r11
    subs        r11,r2,r10,lsl #1
    rsbmi       r11,r11,#0                  @ r11 = dp3
@ dp3 = abs( pu1_src[3 * src_strd - 3] - 2 * pu1_src[3 * src_strd - 2] + pu1_src[3 * src_strd - 1] )

    add         r3,r8,r9                    @ r3 = d0
    add         r4,r11,r12                  @ r4 = d3

@ d0 = dp0 + dq0
@ d3 = dp3 + dq3

    add         r14,r8,r11                  @ r14 = dp
    add         r12,r12,r9                  @ r12 = dq
@ dp = dp0 + dp3
@ dq = dq0 + dq3

    add         r11, r3, r4                 @ r11 = d

@ d = d0 + d3

    cmp         r11,r5
    vdup.32     d22,d0[1]                   @ d22 = q3
    bge         l1.964                      @ d >= beta : leave the samples untouched

@ if(d < beta)

@ Live scalar values from here on: r0, r1, r3 (d0), r4 (d3), r5 (beta),
@ r6 (tc), r12 (dq), r14 (dp).
@ Scratch: r2, r7, r8, r9, r10.
@
@ The de = 2 decision is interleaved with the NEON computation of the de = 2
@ filter outputs. Every early exit goes to l1.336 (de = 1); that path only
@ relies on d2-d7 (p0, p1, q0, q1, p2, q2), which are not written here.
    vqsub.u8    d30,d7,d19                  @ d30 = q2 - 2tc (saturating)
    asr         r10,r5,#2                   @ r10 = beta >> 2
    vqadd.u8    d31,d7,d19                  @ d31 = q2 + 2tc (saturating)
    cmp         r10,r3,lsl #1
    vaddl.u8    q0,d5,d4                    @ q0 = q1 + q0            (16 bit)
    ble         l1.336                      @ fails 2 * d0 < (beta >> 2)

    ldrb        r2,[r0,#-4]                 @ row 0, offset -4
    vaddw.u8    q0,q0,d2                    @ q0 = p0 + q0 + q1
    ldrb        r7,[r0,#-1]                 @ row 0, offset -1
    vmull.u8    q10,d7,d23                  @ q10 = 3 * q2
    ldrb        r3,[r0,#0]                  @ row 0, offset 0 (d0 in r3 is no longer needed)
    vmlal.u8    q10,d22,d18                 @ q10 += 2 * q3
    ldrb        r8,[r0,#3]                  @ row 0, offset 3
@   ubfx   r7,r2,#24,#8           @ has the -1 value
@   and    r2,#0xff               @ has the -4 value
@   ubfx   r8,r3,#24,#8           @ has the 3 value
@   and    r3,#0xff               @ r4 has the 0 value

    vadd.i16    q10,q10,q0                  @ q10 = p0 + q0 + q1 + 3*q2 + 2*q3
    subs        r8,r8,r3
    vrshrn.i16  d22,q10,#3                  @ d22 = (q10 + 4) >> 3
    rsbmi       r8,r8,#0                    @ r8 = abs(pu1_src[3] - pu1_src[0])
    subs        r2,r2,r7
    vmin.u8     d21,d22,d31
    rsbmi       r2,r2,#0                    @ r2 = abs(pu1_src[-4] - pu1_src[-1])
    vmax.u8     d22,d21,d30                 @ d22 = new q2, clipped to q2 +/- 2tc
    add         r8,r8,r2
    vaddl.u8    q10,d7,d3                   @ q10 = q2 + p1
    cmp         r8,r5,asr #3
    vmla.i16    q10,q0,q8                   @ q10 += 2 * (p0 + q0 + q1)
    bge         l1.336                      @ fails abs sum < (beta >> 3) for row 0
    vaddw.u8    q0,q0,d7                    @ q0 = p0 + q0 + q1 + q2
    subs        r7,r3,r7
    vrshrn.i16  d20,q10,#3                  @ d20 = (p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4) >> 3
    rsbmi       r7,r7,#0                    @ r7 = abs(pu1_src[0] - pu1_src[-1])
    vrshrn.i16  d0,q0,#2                    @ d0 = (p0 + q0 + q1 + q2 + 2) >> 2
    mov         r10,#5
    vqadd.u8    d30,d5,d19                  @ d30 = q1 + 2tc
    mul         r10,r10,r6
    vqsub.u8    d31,d5,d19                  @ d31 = q1 - 2tc
    add         r10,#1                      @ r10 = 5 * tc + 1
    cmp         r7,r10,asr #1
    bge         l1.336                      @ fails abs(q0 - p0) < ((5 * tc + 1) >> 1) for row 0

@ Same three checks for row 3 (using d3 in r4):
@ if( (2 * d3 < (beta >> 2) && ( abs(pu1_src[3] - pu1_src[0]) + abs(pu1_src[-1] - pu1_src[-4]) < (beta >> 3) )
@     && abs(pu1_src[0] - pu1_src[-1]) < ( (5 * tc + 1) >> 1 ) )

    asr         r10,r5,#2                   @ r10 = beta >> 2
    vqsub.u8    d25,d4,d19                  @ d25 = q0 - 2tc
    cmp         r10,r4,lsl #1
    vqadd.u8    d21,d4,d19                  @ d21 = q0 + 2tc
    ble         l1.336                      @ fails 2 * d3 < (beta >> 2)
    vmin.u8     d26,d20,d21
    add         r4,r1,r1,lsl #1
    add         r4,r4,r0                    @ r4 = pu1_src + 3 * src_strd (d3 no longer needed)
    vmax.u8     d20,d26,d25                 @ d20 = new q0, clipped to q0 +/- 2tc
    ldrb        r2,[r4,#-4]                 @ row 3, offset -4
    vmin.u8     d19,d0,d30                  @ d19 is scratch here; 2tc is rebuilt below
    ldrb        r7,[r4,#-1]                 @ row 3, offset -1
    vmax.u8     d21,d19,d31                 @ d21 = new q1, clipped to q1 +/- 2tc
    ldrb        r3,[r4,#0]                  @ row 3, offset 0
    lsl         r10,r6,#1
    ldrb        r8,[r4,#3]                  @ row 3, offset 3
@   ubfx   r7,r2,#24,#8           @ has the -1 value
@   and    r2,#0xff               @ has the -4 value
@   ubfx   r8,r3,#24,#8           @ has the 3 value
@   and    r3,#0xff               @ r4 has the 0 value
    vaddl.u8    q0,d2,d3                    @ q0 = p0 + p1
    vdup.8      d19,r10                     @ d19 = 2 * tc again
    subs        r8,r8,r3
    vaddw.u8    q0,q0,d4                    @ q0 = p1 + p0 + q0
    rsbmi       r8,r8,#0                    @ r8 = abs(src[3] - src[0]) on row 3
    vqadd.u8    d30,d2,d19                  @ d30 = p0 + 2tc
    subs        r2,r2,r7
    vqsub.u8    d31,d2,d19                  @ d31 = p0 - 2tc
    rsbmi       r2,r2,#0                    @ r2 = abs(src[-4] - src[-1]) on row 3
    vaddl.u8    q13,d5,d6                   @ q13 = q1 + p2
    add         r8,r8,r2
    vmla.i16    q13,q0,q8                   @ q13 += 2 * (p1 + p0 + q0)
    cmp         r8,r5,asr #3
    bge         l1.336                      @ fails abs sum < (beta >> 3) for row 3
    vrshrn.i16  d26,q13,#3                  @ d26 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
    subs        r7,r3,r7
    vqadd.u8    d27,d3,d19                  @ d27 = p1 + 2tc
    rsbmi       r7,r7,#0                    @ r7 = abs(src[0] - src[-1]) on row 3
    vqsub.u8    d28,d3,d19                  @ d28 = p1 - 2tc
    mov         r10,#5
    vmin.u8     d16,d26,d30                 @ overwrites the low half of q8 (constant 2)
    mul         r10,r10,r6
    add         r10,#1                      @ r10 = 5 * tc + 1
    cmp         r7,r10,asr #1
    vmax.u8     d26,d16,d31                 @ d26 = new p0, clipped to p0 +/- 2tc
    bge         l1.336                      @ fails abs(q0 - p0) < ((5 * tc + 1) >> 1) for row 3
    vqadd.u8    d30,d6,d19                  @ d30 = p2 + 2tc

    mov         r2,#2                       @ de = 2
    ldr         r4,[sp,#filter_p_offset]    @ r4 = filter_flag_p
    vqsub.u8    d31,d6,d19                  @ d31 = p2 - 2tc
    ldr         r5,[sp,#filter_q_offset]    @ r5 = filter_flag_q
    b           end_dep_deq_decision
@ At this point (de = 2):
@   r2 = de, r6 = tc, r4 / r5 = filter flags p / q, r0 = source address, r1 = stride
@   d22 = new q2, d20 = new q0, d21 = new q1, d26 = new p0
@   q0 = p1 + p0 + q0 (16 bit), d27 / d28 = p1 +/- 2tc, d30 / d31 = p2 +/- 2tc

@ de = 1 : decide dep and deq.
@   r6 = tc, r5 = beta, r14 = dp, r12 = dq, r0 = source address, r1 = src stride
l1.336:
    mov         r2,#1                       @ de = 1
l1.424:                                     @ NOTE(review): label not referenced within this file
    mov         r11,r5                      @ r11 = beta (r5 is reused for the q flag)
    ldr         r4,[sp,#filter_p_offset]    @ r4 = filter_flag_p
    ldr         r5,[sp,#filter_q_offset]    @ r5 = filter_flag_q

    cmp         r6,#1                       @ tc == 1 : dep = deq = 0
    moveq       r9,#0
    moveq       r10,#0
    beq         end_dep_deq_decision

    and         r7,r4,r5

@ NOTE(review): (flag_p & flag_q) == 1 detects "both set" only when the flags
@ are 0 or 1 - confirm against the callers.
    cmp         r7,#1
    beq         both_flags_set
    cmp         r4,#0
    beq         set_flag_dep_zero

@ filter_flag_p is non-zero (and the both-set test failed): only dep is evaluated
    add         r8,r11,r11,asr #1
    mov         r10,#0                      @ deq = 0
    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    cmp         r8,r14
    movgt       r9,#1                       @ dep = (dp < r8)
    movle       r9,#0
    b           end_dep_deq_decision
set_flag_dep_zero:
@ filter_flag_p is zero: only deq is evaluated

    add         r8,r11,r11,asr #1
    mov         r9,#0                       @ dep = 0
    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    cmp         r8,r12
    movgt       r10,#1                      @ deq = (dq < r8)
    movle       r10,#0
    b           end_dep_deq_decision

both_flags_set:
    add         r8,r11,r11,asr #1
    asr         r8,#3                       @ r8 = (beta + (beta >> 1)) >> 3
    cmp         r8,r14
    movgt       r9,#1                       @ dep = (dp < r8)
    movle       r9,#0
    cmp         r8,r12
    movgt       r10,#1                      @ deq = (dq < r8)
    movle       r10,#0
end_dep_deq_decision:

@ r0  = source address
@ r1  = stride
@ r2  = de
@ r4  = flag p
@ r5  = flag q
@ r6  = tc
@ r9  = dep   (set only on the de = 1 path)
@ r10 = deq   (set only on the de = 1 path)
@   b     l1.964

    cmp         r2,#2                       @ r2 = de
    bne         l1.968                      @ de = 1 : delta based filter

@ ---------------- de = 2 : store the precomputed outputs ----------------
    cmp         r5,#0                       @ r5 = flag q
    beq         l1.780

@ q side: one byte (new q2) at offset 2 and one halfword (new q0, new q1) at
@ offset 0 for each of the 4 rows
    add         r3,r0,#2
    vst1.8      {d22[0]},[r3],r1

    vst1.8      {d22[1]},[r3],r1

    vst1.8      {d22[2]},[r3],r1

    vst1.8      {d22[3]},[r3]
    add         r3,r0,r1
    vtrn.8      d20,d21                     @ pair (q0, q1) bytes per row

    vst1.16     {d20[0]},[r0]               @ row 0
    vst1.16     {d21[0]},[r3],r1            @ row 1
    vst1.16     {d20[1]},[r3],r1            @ row 2
    vst1.16     {d21[1]},[r3]               @ row 3


l1.780:
    cmp         r4,#0                       @ r4 = flag p
    beq         l1.964

@ p side: new p1 and new p2 are finished here, interleaved with the byte
@ stores of new p0 (d26) at offset -1
    vdup.32     d7,d24[0]                   @ d7 = p3
    sub         r3,r0,#1
    vaddw.u8    q8,q0,d6                    @ q8 = p2 + p1 + p0 + q0
    add         r7,r3,r1
    vrshrn.i16  d2,q8,#2                    @ d2 = (p2 + p1 + p0 + q0 + 2) >> 2
    vst1.8      {d26[0]},[r3]               @ row 0
    sub         r0,r0,#3                    @ r0 -> offset -3 of row 0
    vmin.u8     d16,d2,d27
    vst1.8      {d26[1]},[r7],r1            @ row 1
    vmull.u8    q1,d6,d23                   @ q1 = 3 * p2
    vmlal.u8    q1,d7,d18                   @ q1 += 2 * p3
    vst1.8      {d26[2]},[r7],r1            @ row 2
    vmax.u8     d5,d16,d28                  @ d5 = new p1, clipped to p1 +/- 2tc
    vst1.8      {d26[3]},[r7]               @ row 3
    vadd.i16    q0,q1,q0                    @ q0 = 2*p3 + 3*p2 + p1 + p0 + q0
    vrshrn.i16  d0,q0,#3                    @ d0 = (q0 + 4) >> 3


    vmin.u8     d1,d0,d30
    vmax.u8     d0,d1,d31                   @ d0 = new p2, clipped to p2 +/- 2tc

    vtrn.8      d0,d5                       @ pair (p2, p1) bytes per row
    vst1.16     {d0[0]},[r0],r1             @ row 0, offsets -3 / -2
    vst1.16     {d5[0]},[r0],r1             @ row 1
    vst1.16     {d0[1]},[r0],r1             @ row 2
    vst1.16     {d5[1]},[r0]                @ row 3
l1.964:
    pop         {r3-r12,pc}                 @ common return

@ ---------------- de = 1 : delta based filter ----------------
l1.968:


    vmov.i16    q0,#0x9                     @ q0 = 9 in every halfword
    rsb         r11,r6,#0                   @ r11 = -tc
    cmp         r4,#0                       @ flag p; flags are consumed by "beq l1.1272" below
                                            @ (no instruction in between updates the flags)
    vmov.i16    q8,#0x3                     @ q8 = 3 in every halfword
    vmov.i8     d24,#0x1                    @ d24 = 1 in every byte (rounding term)


    vdup.8      d30,r11                     @ d30 = -tc
    and         r11,r6,#0xff
    vdup.8      d31,r11                     @ d31 = tc

    vsubl.u8    q9,d4,d2                    @ q9 = q0 - p0
    vmul.i16    q9,q9,q0                    @ q9 = 9 * (q0 - p0)
    vsubl.u8    q0,d5,d3                    @ q0 = q1 - p1



    vmul.i16    q8,q0,q8                    @ q8 = 3 * (q1 - p1)
    vsub.i16    q8,q9,q8
    vrshr.s16   q8,q8,#4                    @ q8 = delta (rounded)
@ delta = ( 9 * (pu1_src[0] - pu1_src[-1]) - 3 * (pu1_src[1] - pu1_src[-2]) + 8 ) >> 4

    vabs.s16    q0,q8
    vmovn.i16   d0,q0                       @ d0 = abs(delta), narrowed to bytes

    vqmovn.s16  d16,q8                      @ d16 = delta saturated to signed bytes

    vmov.i8     d1,#0xa
    vdup.8      d21,r11
    vmul.i8     d1,d1,d21                   @ d1 = 10 * tc (8-bit multiply)

@ if(abs(delta) < 10 * tc) - evaluated later per lane with vcge / vbsl

    vmin.s8     d18,d16,d31
    vmax.s8     d20,d18,d30                 @ d20 = clip3(delta, -tc, tc)

@ delta = clip3(delta, -tc, tc)
    vmovl.s8    q8,d20
    vmovl.u8    q9,d2
    vadd.i16    q9,q9,q8                    @ p0 + delta

    vqmovun.s16 d22,q9                      @ d22 = tmp_p0
    vmovl.u8    q9,d4
    vsub.i16    q8,q9,q8                    @ q0 - delta
    vqmovun.s16 d23,q8                      @ d23 = tmp_q0
@ tmp_p0 = clip_u8(pu1_src[-1] + delta)
@ tmp_q0 = clip_u8(pu1_src[0] - delta)
    beq         l1.1272                     @ flag p == 0 : skip the p side



    cmp         r9,#1                       @ dep
    bne         l1.1212

@ dep set: compute tmp_p1
@   delta_p = clip3( ((((p2 + p0 + 1) >> 1) - p1 + delta) >> 1), -(tc >> 1), tc >> 1 )
@   tmp_p1  = clip_u8(p1 + delta_p)
    asr         r3,r6,#1                    @ r3 = tc >> 1


    vaddl.u8    q8,d6,d2                    @ p2 + p0
    vaddw.u8    q8,q8,d24                   @ + 1
    vdup.8      d18,r3                      @ d18 = tc >> 1
    rsb         r3,r3,#0
    vdup.8      d19,r3                      @ d19 = -(tc >> 1)
    vshr.u16    q8,q8,#1
    vmovn.i16   d16,q8                      @ d16 = (p2 + p0 + 1) >> 1

    vsubl.u8    q8,d16,d3                   @ - p1
    vaddw.s8    q8,q8,d20                   @ + delta
    vshr.s16    q8,q8,#1
    vqmovn.s16  d16,q8

    vmin.s8     d17,d16,d18
    vmax.s8     d16,d19,d17                 @ d16 = delta_p





    vmovl.u8    q9,d3
    vmovl.s8    q8,d16
    vadd.i16    q8,q9,q8                    @ p1 + delta_p

    vqmovun.s16 d16,q8                      @ d16 = tmp_p1
    vmov        d30,d3                      @ d30 = original p1
    vcge.u8     d3,d0,d1                    @ mask: abs(delta) >= 10 * tc


    vbsl        d3,d30,d16                  @ d3 = mask ? original p1 : tmp_p1
l1.1212:
    vdup.8      d16,r11                     @ NOTE(review): d16 is overwritten by the vcge below
                                            @ before being read; looks like a leftover of the
                                            @ commented-out vmul
    sub         r12,r0,#3                   @ r12 -> offset -3 (p2, p1 halfword)
    sub         r3,r0,#1                    @ r3  -> offset -1 (p0 byte)
@   vmul.i8  d16,d16,d1
    vtrn.8      d6,d3                       @ pair (p2, p1) bytes per row
    vst1.16     {d6[0]},[r12],r1            @ row 0
    vcge.u8     d16,d0,d1                   @ mask: abs(delta) >= 10 * tc
    vst1.16     {d3[0]},[r12],r1            @ row 1
    vbsl        d16,d2,d22                  @ d16 = mask ? original p0 : tmp_p0
    vst1.8      {d16[0]},[r3],r1            @ row 0
    vst1.8      {d16[1]},[r3],r1            @ row 1
    vst1.16     {d6[1]},[r12],r1            @ row 2
    vst1.8      {d16[2]},[r3],r1            @ row 2
    vst1.16     {d3[1]},[r12]               @ row 3
    vst1.8      {d16[3]},[r3]               @ row 3
l1.1272:
    cmp         r5,#0                       @ flag q
    beq         l1.964
    cmp         r10,#1                      @ deq
    bne         l1.1412

@ deq set: compute tmp_q1
@   delta_q = clip3( ((((q2 + q0 + 1) >> 1) - q1 - delta) >> 1), -(tc >> 1), tc >> 1 )
@   tmp_q1  = clip_u8(q1 + delta_q)
    vmov        d2,d7                       @ d2 = q2
    asr         r3,r6,#1                    @ r3 = tc >> 1

    vdup.8      d6,r3                       @ d6 = tc >> 1
    rsb         r3,r3,#0
    vdup.8      d16,r3                      @ d16 = -(tc >> 1)
    vaddl.u8    q1,d2,d4                    @ q2 + q0
    vaddw.u8    q1,q1,d24                   @ + 1
    vshr.u16    q1,q1,#1
    vmovn.i16   d2,q1                       @ d2 = (q2 + q0 + 1) >> 1

    vsubl.u8    q1,d2,d5                    @ - q1
    vsubw.s8    q1,q1,d20                   @ - delta
    vshr.s16    q1,q1,#1
    vqmovn.s16  d3,q1

    vmin.s8     d2,d3,d6
    vmax.s8     d3,d16,d2                   @ d3 = delta_q
@   vdup.8  d6,r2
@   vmul.i8 d6,d6,d1



    vmovl.u8    q8,d5
    vmovl.s8    q1,d3
    vadd.i16    q1,q8,q1                    @ q1 + delta_q
    vqmovun.s16 d3,q1                       @ d3 = tmp_q1
    vmov        d30,d5                      @ d30 = original q1
    vcge.u8     d5,d0,d1                    @ mask: abs(delta) >= 10 * tc


    vbsl        d5,d30,d3                   @ d5 = mask ? original q1 : tmp_q1
l1.1412:
@   vdup.8  d2,r2
    add         r3,r0,#2                    @ r3 -> offset 2 (q2 byte)
    add         r11,r3,r1
@   vmul.i8 d1,d2,d1
    vst1.8      {d7[0]},[r3]                @ q2 is written back unchanged, row 0
    vst1.8      {d7[1]},[r11],r1            @ row 1
    vst1.8      {d7[2]},[r11],r1            @ row 2
    vcge.u8     d0,d0,d1                    @ mask: abs(delta) >= 10 * tc
    vst1.8      {d7[3]},[r11]               @ row 3
    vbsl        d0,d4,d23                   @ d0 = mask ? original q0 : tmp_q0
    vtrn.8      d0,d5                       @ pair (q0, q1) bytes per row
    vst1.16     {d0[0]},[r0],r1             @ row 0, offsets 0 / 1
    vst1.16     {d5[0]},[r0],r1             @ row 1
    vst1.16     {d0[1]},[r0],r1             @ row 2
    vst1.16     {d5[1]},[r0]                @ row 3
    pop         {r3-r12,pc}


