@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/*******************************************************************************
@* @file
@*  ihevc_deblk_luma_horz.s
@*
@* @brief
@*  contains the function definition for luma deblocking of a horizontal edge.
@*  functions are coded in arm neon assembly and can be compiled using
@*  rvct
@*
@* @author
@*  anand s
@*
@* @par list of functions:
@*  - ihevc_deblk_luma_horz_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************/

@ Offsets of the stack-passed arguments relative to sp after the prologue.
@ The prologue pushes r3-r12 and lr (11 words = 44 bytes) and d8-d15
@ (8 doublewords = 64 bytes), so the first stack argument sits at sp + 108.
.equ qp_q_offset,                108
.equ beta_offset_div2_offset,    112
.equ tc_offset_div2_offset,      116
.equ filter_p_offset,            120
.equ filter_q_offset,            124

.text
.align 4

.extern gai4_ihevc_tc_table
.extern gai4_ihevc_beta_table
.globl ihevc_deblk_luma_horz_a9q

@ Position-independent table addresses. Each word holds the distance from the
@ "add rX,rX,pc" instruction at ulblN to the table; the extra -8 compensates
@ for pc reading as (instruction address + 8) in ARM state.
gai4_ihevc_tc_table_addr:
.long gai4_ihevc_tc_table - ulbl1 - 8

gai4_ihevc_beta_table_addr:
.long gai4_ihevc_beta_table - ulbl2 - 8

.type ihevc_deblk_luma_horz_a9q, %function

@/*******************************************************************************
@* ihevc_deblk_luma_horz_a9q
@*
@* Luma deblocking of one 4-sample wide segment of a horizontal edge.
@* Rows above the edge are addressed as pu1_src - k * src_strd (p side),
@* rows from the edge downwards as pu1_src + k * src_strd (q side).
@*
@* inputs (register / stack, as read by the code below):
@*  r0              : pu1_src, address of the first sample of row 0 (q0 row)
@*  r1              : src_strd, distance in bytes between rows
@*  r2              : bs, boundary strength (only bs >> 1 is used)
@*  r3              : quant_param_p
@*  [sp, #0] at entry : quant_param_q
@*  [sp, #4] at entry : beta_offset_div2
@*  [sp, #8] at entry : tc_offset_div2
@*  [sp, #12] at entry: filter_flag_p (the p side is written only when == 1)
@*  [sp, #16] at entry: filter_flag_q (the q side is written only when == 1)
@*
@* outputs:
@*  up to three rows on each side of the edge are rewritten in place,
@*  4 bytes per row. Nothing is returned in a register.
@*
@* registers:
@*  r3-r12, lr and d8-d15 are saved in the prologue and restored on exit.
@*  r0-r2 and the remaining neon registers used here are not restored.
@*
@* note:
@*  the row loads use 32-bit ldr, so 4 bytes are read from each of the rows
@*  -4 .. +3 around pu1_src.
@*******************************************************************************/
ihevc_deblk_luma_horz_a9q:
    stmfd       sp!, {r3-r12,lr}
    vpush       {d8 - d15}

    @ qp_luma   = (quant_param_p + quant_param_q + 1) >> 1
    @ beta_indx = clip3(qp_luma + (beta_offset_div2 << 1), 0, 51)
    @ tc_indx   = clip3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53)
    ldr         r4,[sp,#qp_q_offset]
    ldr         r5,[sp,#beta_offset_div2_offset]

    add         r3,r3,r4
    add         r3,r3,#1
    ldr         r6, [sp,#tc_offset_div2_offset]
    asr         r3,r3,#1                    @ r3 = qp_luma
    add         r7,r3,r5,lsl #1
    add         r3,r3,r6,lsl #1
    cmp         r7,#0x33
    movgt       r7,#0x33
    bgt         l1.1532
    cmp         r7,#0x0
    movlt       r7,#0x0                     @ r7 has the beta_index value
l1.1532:
    @ bic         r2,r2,#1
    asr         r2,r2,#1                    @ r2 = bs >> 1

    add         r3,r3,r2,lsl #1
    cmp         r3,#0x35
    movgt       r3,#0x35
    bgt         l1.1564
    cmp         r3,#0x0
    movlt       r3,#0x0                     @ r3 has the tc_index value

l1.1564:
    ldr         r2,gai4_ihevc_beta_table_addr
ulbl2:
    add         r2,r2,pc                    @ r2 = &gai4_ihevc_beta_table[0]
    ldr         r4,gai4_ihevc_tc_table_addr
ulbl1:
    add         r4,r4,pc                    @ r4 = &gai4_ihevc_tc_table[0]

    ldr         r5,[r2,r7,lsl #2]           @ beta
    ldr         r6,[r4,r3,lsl #2]           @ tc

    @ nothing to filter when tc == 0
    cmp         r6,#0
    beq         l1.2404

    @ Load 4 bytes of rows -3 .. +2. Each word is kept twice:
    @  - duplicated into a d register for the neon filter arithmetic
    @    (d23 = row -3, d24 = row -2, d25 = row -1,
    @     d26 = row  0, d27 = row +1, d28 = row +2)
    @  - masked to its lowest byte (column 0) for the scalar decisions
    @ d0 = 2 in every 16-bit lane, d1 = 2 * tc in every byte.
    vmov.i16    d0,#0x2
    lsl         r7,r6,#1
    add         r14,r1,r1,lsl #1            @ r14 = 3 * src_strd
    ldr         r8,[r0,-r14]                @ -3 value
    vdup.8      d1,r7
    ldr         r10,[r0,-r1,lsl #1]         @ -2 value
    vdup.32     d23,r8                      @ -3 value
    ldr         r11,[r0,-r1]                @ -1 value
    vdup.32     d24,r10                     @ -2 value
    and         r8,#0xff
    ldr         r12,[r0,#0]                 @  0 value
    vdup.32     d25, r11                    @ -1 value
    and         r10,#0xff
    ldr         r9,[r0,r1]                  @  1 value
    vdup.32     d26,r12                     @  0 value
    and         r11,#0xff
    ldr         r2,[r0,r1,lsl #1]           @  2 value
    vdup.32     d27,r9                      @  1 value
    and         r12,#0xff
    vdup.32     d28,r2                      @  2 value
    and         r9,#0xff
    and         r2,#0xff

    @ dq0 = abs( pu1_src[2 * src_strd] - 2 * pu1_src[src_strd] + pu1_src[0] )
    add         r12,r12,r2
    subs        r9,r12,r9,lsl #1            @ dq0 value is stored in r9
    rsbmi       r9,r9,#0

    @ dp0 = abs( pu1_src[-3 * src_strd] - 2 * pu1_src[-2 * src_strd] + pu1_src[-src_strd] )
    add         r8,r8,r11
    subs        r8,r8,r10,lsl #1
    rsbmi       r8,r8,#0                    @ dp0 value is stored in r8

    @ same activity measures for column 3 (byte loads from pu1_src + 3)
    add         r3,r1,r1,lsl #1             @ r3 = 3 * src_strd
    add         r14,r0,#3

    ldrb        r2,[r14,-r3]                @ -3 value
    ldrb        r10,[r14,-r1,lsl #1]        @ -2 value
    ldrb        r11,[r14,-r1]               @ -1 value
    ldrb        r12,[r14,#0]                @  0 value
    ldrb        r3,[r14,r1]                 @  1 value
    ldrb        r4,[r14,r1,lsl #1]          @  2 value

    @ dq3 = abs( pu1_src[2 * src_strd + 3] - 2 * pu1_src[src_strd + 3] + pu1_src[3] )
    add         r12,r12,r4
    subs        r12,r12,r3,lsl #1           @ dq3 value is stored in r12
    rsbmi       r12,r12,#0

    @ dp3 = abs( pu1_src[-3 * src_strd + 3] - 2 * pu1_src[-2 * src_strd + 3] + pu1_src[-src_strd + 3] )
    add         r2,r2,r11
    subs        r11,r2,r10,lsl #1
    rsbmi       r11,r11,#0                  @ dp3 value is stored in r11

    @ d0 = dp0 + dq0
    @ d3 = dp3 + dq3
    add         r3,r8,r9                    @ r3 has the d0 value
    add         r4,r11,r12                  @ r4 has the d3 value

    @ dp = dp0 + dp3
    @ dq = dq0 + dq3
    add         r14,r8,r11                  @ r14 has the value dp
    add         r12,r12,r9                  @ r12 has the value dq

    @ d = d0 + d3
    add         r11, r3, r4                 @ r11 has the value d

    @ if(d < beta) the edge is filtered, otherwise leave it untouched
    cmp         r11,r5
    bge         l1.2404

    @ Strong / weak decision.
    @ live scalar values: r0 src, r1 stride, r3 d0, r4 d3, r5 beta, r6 tc,
    @                     r12 dq, r14 dp
    @ scratch           : r2, r7, r8, r9, r10
    @ The neon instructions interleaved with the scalar tests below compute
    @ the strong filter outputs ahead of the decision:
    @   d4 = q0', d5 = q1', d3 = q2' (q side), d2 = p0' (p side);
    @   the remaining p side values are finished after the decision.
    @ Every output is clamped to [sample - 2 * tc, sample + 2 * tc] with the
    @ saturating bounds kept in d30 / d31 / d16 / d17.

    @ ---- column 0 conditions ----
    @ 2 * d0 < (beta >> 2)
    asr         r10,r5,#2
    vqadd.u8    d30,d26,d1                  @ q0 + 2 * tc
    cmp         r10,r3,lsl #1
    vqsub.u8    d31,d26,d1                  @ q0 - 2 * tc
    ble         l1.1840
    add         r10,r1,r1,lsl #1            @ r10 = 3 * src_strd
    vaddl.u8    q3,d25,d26                  @ p0 + q0
    ldr         r2,[r0,-r1,lsl #2]          @ has the -4 value
    ldrb        r7,[r0,-r1]                 @ has the -1 value
    vdup.32     d22,r2                      @ -4 value
    vaddw.u8    q4,q3,d27                   @ p0 + q0 + q1
    ldrb        r3,[r0,#0]                  @ r3 has the 0 value
    vqadd.u8    d16,d27,d1                  @ q1 + 2 * tc
    and         r2,#0xff
    vmul.i16    q6,q4,d0[0]                 @ 2 * (p0 + q0 + q1)
    ldr         r8,[r0,r10]                 @ has the 3 value
    vaddl.u8    q5,d24,d28                  @ p1 + q2
    subs        r2,r2,r7
    vqsub.u8    d17,d27,d1                  @ q1 - 2 * tc
    vdup.32     d29,r8                      @ 3 value
    and         r8,#0xff
    vadd.i16    q6,q6,q5                    @ p1 + 2*p0 + 2*q0 + 2*q1 + q2
    rsbmi       r2,r2,#0                    @ abs(row -4 - row -1)
    vrshrn.i16  d20,q6,#3
    subs        r8,r8,r3
    rsbmi       r8,r8,#0                    @ abs(row 3 - row 0)
    vmin.u8     d18,d20,d30
    add         r8,r8,r2

    @ abs(row 3 - row 0) + abs(row -1 - row -4) < (beta >> 3)
    cmp         r8,r5,asr #3
    bge         l1.1840
    vaddw.u8    q7,q4,d28                   @ p0 + q0 + q1 + q2
    subs        r7,r3,r7
    vmax.u8     d4,d18,d31                  @ d4 = q0' (strong)
    rsbmi       r7,r7,#0                    @ abs(row 0 - row -1)
    vqadd.u8    d30,d28,d1                  @ q2 + 2 * tc
    mov         r10,#5
    vrshrn.i16  d21,q7,#2
    mul         r10,r10,r6
    vqsub.u8    d31,d28,d1                  @ q2 - 2 * tc
    add         r10,#1
    @ abs(row 0 - row -1) < ((5 * tc + 1) >> 1)
    cmp         r7,r10,asr #1
    vmin.u8     d18,d21,d16
    bge         l1.1840

    @ ---- column 3 conditions (same three tests, using d3) ----
    vmax.u8     d5,d18,d17                  @ d5 = q1' (strong)
    asr         r10,r5,#2
    vaddl.u8    q8,d29,d28                  @ q3 + q2
    cmp         r10,r4,lsl #1
    ble         l1.1840

    add         r10,r1,r1,lsl #1            @ r10 = 3 * src_strd
    vmul.i16    q8,q8,d0[0]                 @ 2 * (q3 + q2)
    add         r4,r0,#3

    ldrb        r2,[r4,-r1,lsl #2]          @ -4 value, column 3
    vadd.i16    q8,q8,q7                    @ p0 + q0 + q1 + 3*q2 + 2*q3
    ldrb        r7,[r4,-r1]                 @ -1 value, column 3
    vrshrn.i16  d19,q8,#3
    ldrb        r3,[r4,#0]                  @  0 value, column 3
    ldrb        r8,[r4,r10]                 @  3 value, column 3
    @ ubfx   r7,r2,#24,#8           @ has the -1 value
    @ and    r2,#0xff               @ has the -4 value
    @ ubfx   r8,r3,#24,#8           @ has the 3 value
    @ and    r3,#0xff               @ r4 has the 0 value

    subs        r8,r8,r3
    vmin.u8     d18,d19,d30
    rsbmi       r8,r8,#0
    vaddl.u8    q3,d25,d24                  @ p0 + p1
    subs        r2,r2,r7
    vmax.u8     d3,d18,d31                  @ d3 = q2' (strong)
    rsbmi       r2,r2,#0
    vaddw.u8    q4,q3,d26                   @ p1 + p0 + q0
    add         r8,r8,r2
    vqadd.u8    d30,d25,d1                  @ p0 + 2 * tc
    cmp         r8,r5,asr #3
    vqsub.u8    d31,d25,d1                  @ p0 - 2 * tc
    bge         l1.1840
    vmul.i16    q6,q4,d0[0]                 @ 2 * (p1 + p0 + q0)
    subs        r7,r3,r7
    vqadd.u8    d16,d24,d1                  @ p1 + 2 * tc
    rsbmi       r7,r7,#0
    vaddl.u8    q5,d23,d27                  @ p2 + q1
    mov         r10,#5
    vqsub.u8    d17,d24,d1                  @ p1 - 2 * tc
    mul         r10,r10,r6
    vadd.i16    q6,q6,q5                    @ p2 + 2*p1 + 2*p0 + 2*q0 + q1
    add         r10,#1
    vrshrn.i16  d20,q6,#3
    cmp         r7,r10,asr #1
    vaddw.u8    q7,q4,d23                   @ p2 + p1 + p0 + q0
    bge         l1.1840

    @ all conditions hold for both columns: strong filter (de = 2)
    vmin.u8     d18,d20,d30
    mov         r2,#2
    vqadd.u8    d30,d23,d1                  @ p2 + 2 * tc
    ldr         r4,[sp,#filter_p_offset]    @ loading the filter_flag_p
    vmax.u8     d2,d18,d31                  @ d2 = p0' (strong)
    ldr         r5,[sp,#filter_q_offset]    @ loading the filter_flag_q
    vrshrn.i16  d21,q7,#2
    b           end_dep_deq_decision_horz
    @ r2 has the value of de
    @ r6 has the value of tc
    @ r5 has the value of beta (until the flag loads above / below)
    @ r14 has the value of dp
    @ r12 has the value of dq
    @ r0 has the value of source address
    @ r1 has the src stride

    @ Weak filter selected (de = 1). Decide whether the second sample on
    @ each side is modified as well:
    @   dep = filter_flag_p && (dp < ((beta + (beta >> 1)) >> 3))
    @   deq = filter_flag_q && (dq < ((beta + (beta >> 1)) >> 3))
    @ Both are forced to 0 when tc == 1.
l1.1840:
    mov         r2,#1

    mov         r11,r5                      @ keep beta, r5 is reused for flag q
    ldr         r4,[sp,#filter_p_offset]    @ loading the filter_flag_p
    ldr         r5,[sp,#filter_q_offset]    @ loading the filter_flag_q

    cmp         r6,#1
    moveq       r9,#0
    moveq       r10,#0
    beq         end_dep_deq_decision_horz

    and         r7,r4,r5
    cmp         r7,#1
    beq         both_flags_set_horz
    cmp         r4,#0
    beq         set_flag_dep_zero_horz

    @ flag p set, flag q clear: only dep can be 1
    add         r8,r11,r11,asr #1
    mov         r10,#0
    asr         r8,#3
    cmp         r8,r14
    movgt       r9,#1
    movle       r9,#0
    b           end_dep_deq_decision_horz
set_flag_dep_zero_horz:
    @ flag p clear: dep = 0, only deq can be 1
    add         r8,r11,r11,asr #1
    mov         r9,#0
    asr         r8,#3
    cmp         r8,r12
    movgt       r10,#1
    movle       r10,#0
    b           end_dep_deq_decision_horz

both_flags_set_horz:
    add         r8,r11,r11,asr #1
    asr         r8,#3
    cmp         r8,r14
    movgt       r9,#1
    movle       r9,#0
    cmp         r8,r12
    movgt       r10,#1
    movle       r10,#0
end_dep_deq_decision_horz:

    @ r0  = source address
    @ r1  = stride
    @ r2  = de (1 = weak, 2 = strong)
    @ r4  = flag p
    @ r5  = flag q
    @ r6  = tc
    @ r9  = dep (weak filter only)
    @ r10 = deq (weak filter only)

    @ add    r14,r1,r1,lsl #1
    @ lsl    r7,r6,#1
    @ vdup.8 d1,r7
    @ vmov.i16 d0,#0x2
    @ The two neon instructions before the branch belong to the strong
    @ filter; on the weak path d18 and d31 are rewritten before use.
    vmin.u8     d18,d21,d16
    cmp         r2,#1
    vqsub.u8    d31,d23,d1                  @ p2 - 2 * tc
    beq         l1.2408
    vaddl.u8    q4,d23,d22                  @ p2 + p3
    cmp         r5,#1

    @ The q side is skipped when filter_flag_q != 1. The p side is still
    @ gated on filter_flag_p, so nothing is written when neither flag is 1.
    bne         strong_check_flag_p_horz

strong_filtering_q:
    mov         r12,r0
    vst1.32     d4[0],[r12],r1              @ row  0 = q0'
    vst1.32     d5[0],[r12],r1              @ row +1 = q1'
    vst1.32     d3[0],[r12]                 @ row +2 = q2'
strong_check_flag_p_horz:
    cmp         r4,#1
    bne         l1.2404
strong_filtering_p:
    vmax.u8     d5,d18,d17                  @ d5 = p1' (strong)
    mov         r12,r0
    vmul.i16    q4,q4,d0[0]                 @ 2 * (p2 + p3)
    rsb         r11,r1,#0                   @ r11 = -src_strd
    vadd.i16    q8,q4,q7                    @ 2*p3 + 3*p2 + p1 + p0 + q0
    add         r12,r12,r11
    vrshrn.i16  d19,q8,#3
    vst1.32     d2[0],[r12],r11             @ row -1 = p0'
    vmin.u8     d18,d19,d30
    vst1.32     d5[0],[r12],r11             @ row -2 = p1'
    vmax.u8     d3,d18,d31                  @ d3 = p2' (strong)
    vst1.32     d3[0],[r12]                 @ row -3 = p2'

l1.2404:
    vpop        {d8 - d15}
    ldmfd       sp!, {r3-r12,pc}

    @ Weak filter.
    @ scalar inputs: r4 = flag p, r5 = flag q, r6 = tc, r9 = dep, r10 = deq
    @ neon inputs  : d22 = row -4, d23 = row -3, d24 = row -2, d25 = row -1,
    @                d26 = row  0, d27 = row +1, d28 = row +2, d29 = row +3
    @                (d22 and d29 are only loaded on the strong-filter
    @                 decision path and are not read here)
l1.2408:

    vmov.i16    d0,#0x9

    vsubl.u8    q5,d26,d25                  @ q0 - p0

    vmul.i16    q5,q5,d0[0]                 @ 9 * (q0 - p0)

    vmov.i16    d0,#0x3

    vsubl.u8    q6,d27,d24                  @ q1 - p1
    vmul.i16    q6,q6,d0[0]                 @ 3 * (q1 - p1)

    vdup.8      d30,r6                      @ duplicating the +tc value

    rsb         r12,r6,#0
    vdup.8      d31,r12                     @ duplicating the -tc value

    vsub.i16    q5,q5,q6

    @ delta = ( 9 * (q0 - p0) - 3 * (q1 - p1) + 8 ) >> 4
    vrshr.s16   q5,q5,#4

    vabs.s16    q4,q5
    vmovn.i16   d9,q4
    @ storing the absolute values of delta in d9

    vqmovn.s16  d10,q5
    @ storing the signed-saturated values of delta in d10

    vmin.s8     d11,d10,d30
    vmax.s8     d8,d31,d11                  @ d8 has the value delta = clip3(delta, -tc, tc)

    vmovl.u8    q3,d25

    vaddw.s8    q2,q3,d8

    vqmovun.s16 d12,q2                      @ d12 = temp_p0 = clip_u8(p0 + delta)
    vmovl.u8    q3,d26
    vsubw.s8    q2,q3,d8
    vqmovun.s16 d13,q2                      @ d13 = temp_q0 = clip_u8(q0 - delta)

    mov         r11,#0xa
    mul         r12,r11,r6
    vdup.8      d2,r12                      @ d2 has the 10*tc value
    vmov        d18,d24                     @ default p1 output = unmodified p1
    vdup.8      d0,r6
    vshr.s8     d0,#1                       @ d0 =  (tc >> 1)
    vneg.s8     d1,d0                       @ d1 = -(tc >> 1)

    cmp         r4,#1
    bne         l1.2724
    cmp         r9,#1
    bne         l1.2700

    @ dep == 1: also modify p1
    @ delta_p = clip3( (((p2 + p0 + 1) >> 1) - p1 + delta) >> 1, -(tc >> 1), tc >> 1 )
    vaddl.u8    q7,d23,d25
    vrshrn.u16  d14,q7,#1
    vsubl.u8    q7,d14,d24
    vaddw.s8    q7,q7,d8
    vqshrn.s16  d14,q7,#1
    vmin.s8     d15,d14,d0
    vmax.s8     d14,d1,d15

    @ d14 has the delta p value
    vmovl.u8    q8,d24
    vaddw.s8    q8,q8,d14
    vqmovun.s16 d14,q8

    @ d14 = tmp_p1 = clip_u8(p1 + delta_p)
    @ keep the original p1 wherever abs(delta) >= 10 * tc
    vcge.u8     d18,d9,d2
    vbsl        d18,d24,d14

l1.2700:
    mov         r12,r0
    rsb         r11,r1,#0                   @ r11 = -src_strd
    add         r12,r11
    vcge.u8     d19,d9,d2                   @ mask: abs(delta) >= 10 * tc
    vbsl        d19,d25,d12                 @ masked lanes keep p0, others temp_p0
    vst1.32     {d19[0]},[r12],r11          @ row -1
    vst1.32     {d18[0]},[r12]              @ row -2
l1.2724:
    cmp         r5,#1
    bne         l1.2404
    cmp         r10,#1
    vmov        d18, d27                    @ default q1 output = unmodified q1
    bne         l1.2852

    @ deq == 1: also modify q1
    @ delta_q = clip3( (((q2 + q0 + 1) >> 1) - q1 - delta) >> 1, -(tc >> 1), tc >> 1 )
    vaddl.u8    q7,d26,d28
    vrshrn.u16  d14,q7,#1
    vsubl.u8    q7,d14,d27
    vsubw.s8    q7,q7,d8
    vqshrn.s16  d14,q7,#1
    vmin.s8     d15,d14,d0
    vmax.s8     d14,d1,d15
    @ d14 has the delta q value
    vmovl.u8    q8,d27
    vaddw.s8    q8,q8,d14
    vqmovun.s16 d14,q8
    @ keep the original q1 wherever abs(delta) >= 10 * tc
    vcge.u8     d18,d9,d2
    vbsl        d18,d27,d14
l1.2852:
    mov         r12,r0
    vcge.u8     d19,d9,d2                   @ mask: abs(delta) >= 10 * tc
    vbsl        d19,d26,d13                 @ masked lanes keep q0, others temp_q0
    vst1.32     {d19[0]},[r12],r1           @ row  0
    vst1.32     {d18[0]},[r12]              @ row +1

    vpop        {d8 - d15}
    ldmfd       sp!, {r3-r12,r15}