@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_planar.s
@*
@* @brief
@*  contains function definitions for intra prediction (planar mode).
@*  functions are coded in arm neon assembly (gnu assembler syntax)
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_planar_a9q
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  luma intra prediction filter for planar mode
@*
@* @par description:
@*  for every pixel of the nt x nt block the routine evaluates
@*
@*    dst[row][col] = ( (nt-1-col) * src[2nt-1-row]
@*                    + (col+1)    * src[3nt+1]
@*                    + (nt-1-row) * src[2nt+1+col]
@*                    + (row+1)    * src[nt-1]
@*                    + nt ) >> (32 - clz(nt))
@*
@*  where src is pu1_ref. for a power-of-two nt the shift equals log2(nt)+1.
@*  nt == 4 is handled by a simple row loop (tf_sz_4); every other size is
@*  processed in 8x8 tiles by a software-pipelined loop
@*  (prolog / kernel_plnr / epilog), so nt is assumed to be a multiple of 8
@*  there.
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the reference samples (r0)
@*
@* @param[in] src_strd
@*  integer source stride (r1) - never read, r1 is reused as a counter
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination (r2)
@*
@* @param[in] dst_strd
@*  integer destination stride (r3)
@*
@* @param[in] nt
@*  size of transform block (stack)
@*
@* @param[in] mode
@*  type of filtering (stack) - not read by this routine
@*
@* @param[in] pi1_coeff
@*  planar coefficients (stack) - not read by this routine, the weights are
@*  taken from gau1_ihevc_planar_factor / gau1_ihevc_planar_factor_1
@*
@* @returns
@*  none
@*
@* @remarks
@*  the weight tables are defined outside this file. per the comments kept
@*  from the original code, gau1_ihevc_planar_factor + 1 supplies the (col+1)
@*  weights and gau1_ihevc_planar_factor_1 supplies (row+1) weights in groups
@*  of eight - TODO confirm against the table definitions.
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_planar(uword8* pu1_ref,
@                                  word32 src_strd,
@                                  uword8* pu1_dst,
@                                  word32 dst_strd,
@                                  word32 nt,
@                                  word32 mode,
@                                  word32 pi1_coeff)
@**************variables vs registers*****************************************
@r0 => *pu1_ref        (later: &src[2nt+1], reload value for r14)
@r1 => src_strd        (later: columns left in the current band / row count)
@r2 => *pu1_dst        (running store pointer)
@r3 => dst_strd
@r4 => nt
@r5 => pointer into gau1_ihevc_planar_factor_1 (8/16/32 path)
@r6 => pointer to the left-column samples src[2nt-1-row]
@r7 => byte scratch, then remaining work counter (nt * nt / 8)
@r9 => 8 - 8*dst_strd  (8/16/32 path)
@r10 => nt - 8         (8/16/32 path)
@r11 => gau1_ihevc_planar_factor
@r12 => pointer to the current (col+1) weights
@r14 => pointer to src[2nt+1+col]
@
@d0 => src[nt-1] replicated        d1 => src[3nt+1] replicated
@d2 => nt replicated               d3 => src[2nt+1+col], 8 columns
@d4 => 8 left-column samples       d5 => (row+1)
@d6 => (nt-1-row)                  d7 => constant 1
@d8 => (col+1) weights             d9 => (nt-1-col) weights
@q7 => -(32 - clz(nt)), negative so that vshl performs a right shift

@stack contents from #104
@   nt
@   mode
@   pi1_coeff

@ 10 core registers (40 bytes) + d8-d15 (64 bytes) are pushed on entry, so the
@ first stacked argument sits at sp + 104.
.equ nt_offset, 104

.text
.align 4




.globl ihevc_intra_pred_luma_planar_a9q
.extern gau1_ihevc_planar_factor
.extern gau1_ihevc_planar_factor_1

@ pc-relative literals: each holds (table - label - 8); the -8 bias matches
@ the pc value that the "add rX, rX, pc" placed at that label reads.
gau1_ihevc_planar_factor_addr:
.long gau1_ihevc_planar_factor - ulbl1 - 8

gau1_ihevc_planar_factor_1_addr:
.long gau1_ihevc_planar_factor_1 - ulbl2 - 8


.type ihevc_intra_pred_luma_planar_a9q, %function

ihevc_intra_pred_luma_planar_a9q:

    stmfd       sp!, {r4-r12, r14}          @save callee-saved core registers and lr
    vpush       {d8 - d15}                  @save callee-saved neon registers
    ldr         r4,[sp,#nt_offset]          @loads nt
    ldr         r11, gau1_ihevc_planar_factor_addr @loads table of coeffs
ulbl1:
    add         r11,r11,pc

    clz         r5, r4
    rsb         r5, r5, #32                 @32 - clz(nt)
    vdup.16     q7, r5
    vneg.s16    q7, q7                      @shr value (so vneg)
    vdup.8      d2, r4                      @nt
    vdup.s16    q8, r4                      @nt

    sub         r6, r4, #1                  @nt-1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @only one byte is needed: byte load avoids
                                            @an unaligned, endian-dependent word read
    vdup.s8     d0, r7                      @src[nt-1]

    add         r6, r4, r4,lsl #1           @3nt
    add         r6, r6, #1                  @3nt + 1
    add         r6, r6, r0
    ldrb        r7, [r6]                    @single byte, see above
    vdup.s8     d1, r7                      @src[3nt+1]

    add         r6, r4, r4                  @2nt
    add         r14, r6, #1                 @2nt+1
    sub         r6, r6, #1                  @2nt-1
    add         r6, r6, r0                  @&src[2nt-1]
    add         r14, r14, r0                @&src[2nt+1]

    mov         r8, #1                      @row+1 (row is first 0)
    sub         r9, r4, r8                  @nt-1-row (row is first 0)

    vdup.s8     d5, r8                      @row + 1
    vdup.s8     d6, r9                      @nt - 1 - row
    vmov        d7, d5                      @constant 1: increment for row+1, decrement for nt-1-row

    add         r12, r11, #1                @coeffs (to be reloaded after every row)
    mov         r1, r4                      @nt (row counter) (dec after every row)
    mov         r5, r2                      @dst (to be reloaded after every row and inc by dst_strd)
    mov         r10, #8                     @increment for the coeffs
    mov         r0, r14                     @&src[2nt+1] (to be reloaded after every row)

    cmp         r4, #4
    beq         tf_sz_4

@@ ========== ***************** =====================
@ 8x8 tile pipeline. the prolog computes the first tile and stores its rows
@ 1-6; rows 7 and 8 of every tile are stored at the top of the next kernel
@ iteration, or in the epilog for the last tile.
prolog:
tf_sz_8_16_32:

    mov         r7, r4                      @column counter (set to no of cols)
    mov         r9, r4, lsr #3              @divide nt by 8
    mul         r7, r7, r9                  @nt * (nt / 8); reduced by 8 per tile
    ldr         r5, gau1_ihevc_planar_factor_1_addr @loads table of coeffs
ulbl2:
    add         r5,r5,pc
    sub         r6, r6, #7                  @&src[2nt-8]: 8 left samples are loaded at once
    mov         r8, r2
    lsl         r9, r3, #3                  @8*stride
    rsb         r9, r9, #8                  @8 - 8*stride: from below a tile to the top of the next one
    mov         r10, r4                     @nt
    sub         r10, r10, #8                @nt - 8

col_loop_8_16_32:

    vld1.s8     d8, [r12]                   @(1-8)load 8 coeffs [col+1]
    vdup.16     q6, r4                      @(1) accumulator starts at nt (rounding term)
    vld1.s8     d4, [r6]                    @(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1-8)[nt-1-col]


    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vld1.s8     d3, [r14]                   @(1-8)load 8 src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d20, d4[7]                  @(1)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vdup.16     q15, r4                     @(2)
    vadd.s8     d5, d5, d7                  @(1)

    vsub.s8     d6, d6, d7                  @(1)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d5, d0                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d8, d1                 @(2)

    vmlal.u8    q15, d6, d3                 @(2)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    vsub.s8     d6, d6, d7                  @(2)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)
    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)
    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    vshl.s16    q5, q5, q7                  @(4)shr
    vadd.s8     d5, d5, d7                  @(5)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    vshl.s16    q8, q8, q7                  @(5)shr

    vadd.s8     d5, d5, d7                  @(6)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)
    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)


    vmlal.u8    q12, d8, d1                 @(8)

    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vshl.s16    q13, q13, q7                @(7)shr

    subs        r7, r7, #8

    beq         epilog                      @single tile (nt == 8): only rows 7 and 8 remain

    subs        r1, r1, #8                  @row counter
    addgt       r12, r12, #8                @col inc
    addgt       r14, r14, #8                @also for col inc
    movle       r1, r4                      @nt reloaded (refresh the value)
    addle       r12, r11, #1                @r12 reset

    movle       r14, r0                     @r14 reset
    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]

    suble       r6, r6, #8                  @for next set of rows
    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]

    addle       r5, r5, #8
    vdup.16     q6, r4                      @(1n)(1)

    vld1.s8     d5, [r5]                    @(row+1) weights for the next tile

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vsub.s8     d6, d2, d5                  @(nt-1-row) for the next tile

    beq         epilog                      @Z here still reflects the subs on r1 above

kernel_plnr:

    cmp         r1, #0                      @ (cond loop)
    vshl.s16    q12, q12, q7                @(8)shr

    vmovn.i16   d26, q13                    @(7)
    vmlal.u8    q6, d5, d0                  @(1)(row+1) * src[nt-1]

    vmovn.i16   d24, q12                    @(8)
    vmlal.u8    q6, d8, d1                  @(1)(col+1) * src[3nt+1]

    vdup.s8     d21, d4[6]                  @(2)
    vmlal.u8    q6, d6, d3                  @(1)(nt-1-row) * src[2nt+1+col]

    vdup.16     q15, r4                     @(2)
    vmlal.u8    q6, d9, d20                 @(1)(nt-1-col) * src[2nt-1-row]

    vst1.s8     d26, [r2], r3               @(7)str 8 values
    vadd.s8     d5, d5, d7                  @(1)

    vst1.s8     d24, [r2], r3               @(8)str 8 values
    vsub.s8     d6, d6, d7                  @(1)

    addgt       r2, r2, r9                  @since more cols to fill, dst + 8 - 8*strd (cond loop)
    vmlal.u8    q15, d5, d0                 @(2)

    suble       r2, r2, r10                 @else go to next set of rows, dst - (nt-8) (cond loop)
    vmlal.u8    q15, d8, d1                 @(2)

    vdup.s8     d22, d4[5]                  @(3)
    vmlal.u8    q15, d6, d3                 @(2)

    vdup.16     q14, r4                     @(3)
    vmlal.u8    q15, d9, d21                @(2)

    vshl.s16    q6, q6, q7                  @(1)shr

    vadd.s8     d5, d5, d7                  @(2)
    movle       r1, r4                      @nt reloaded (refresh the value) (cond loop)

    vsub.s8     d6, d6, d7                  @(2)
    subs        r1, r1, #8                  @row counter (loop)

    vmovn.i16   d12, q6                     @(1)
    vmlal.u8    q14, d5, d0                 @(3)

    vdup.8      d23, d4[4]                  @(4)
    vmlal.u8    q14, d8, d1                 @(3)

    vdup.16     q5, r4                      @(4)
    vmlal.u8    q14, d6, d3                 @(3)

    vst1.s8     d12, [r2], r3               @(1)str 8 values
    vmlal.u8    q14, d9, d22                @(3)

    vshl.s16    q15, q15, q7                @(2)shr

    vadd.s8     d5, d5, d7                  @(3)

    vsub.s8     d6, d6, d7                  @(3)

    vmovn.i16   d30, q15                    @(2)
    vmlal.u8    q5, d5, d0                  @(4)

    vdup.8      d20, d4[3]                  @(5)
    vmlal.u8    q5, d8, d1                  @(4)

    vdup.16     q8, r4                      @(5)
    vmlal.u8    q5, d6, d3                  @(4)

    vst1.s8     d30, [r2], r3               @(2)str 8 values
    vmlal.u8    q5, d9, d23                 @(4)

    vshl.s16    q14, q14, q7                @(3)shr

    vadd.s8     d5, d5, d7                  @(4)

    vsub.s8     d6, d6, d7                  @(4)

    vmovn.i16   d28, q14                    @(3)
    vmlal.u8    q8, d5, d0                  @(5)

    vdup.8      d21, d4[2]                  @(6)
    vmlal.u8    q8, d8, d1                  @(5)

    vdup.16     q9, r4                      @(6)
    vmlal.u8    q8, d6, d3                  @(5)

    vst1.s8     d28, [r2], r3               @(3)str 8 values
    vmlal.u8    q8, d9, d20                 @(5)

    addle       r12, r11, #1                @r12 reset (cond loop)
    vshl.s16    q5, q5, q7                  @(4)shr

    addgt       r12, r12, #8                @col inc (cond loop)
    vadd.s8     d5, d5, d7                  @(5)

    addgt       r14, r14, #8                @also for col inc (cond loop)
    vsub.s8     d6, d6, d7                  @(5)

    vmovn.i16   d10, q5                     @(4)
    vmlal.u8    q9, d5, d0                  @(6)

    vdup.8      d22, d4[1]                  @(7)
    vmlal.u8    q9, d8, d1                  @(6)

    vdup.16     q13, r4                     @(7)
    vmlal.u8    q9, d6, d3                  @(6)

    vst1.s8     d10, [r2], r3               @(4)str 8 values
    vmlal.u8    q9, d9, d21                 @(6)

    movle       r14, r0                     @r14 reset (cond loop)
    vshl.s16    q8, q8, q7                  @(5)shr

    suble       r6, r6, #8                  @for next set of rows (cond loop)
    vadd.s8     d5, d5, d7                  @(6)

    addle       r5, r5, #8                  @ (cond loop)
    vsub.s8     d6, d6, d7                  @(6)

    vmovn.i16   d16, q8                     @(5)
    vmlal.u8    q13, d5, d0                 @(7)

    vdup.8      d23, d4[0]                  @(8)
    vmlal.u8    q13, d8, d1                 @(7)

    vdup.16     q12, r4                     @(8)
    vmlal.u8    q13, d6, d3                 @(7)

    vst1.s8     d16, [r2], r3               @(5)str 8 values
    vmlal.u8    q13, d9, d22                @(7)

    vld1.s8     d4, [r6]                    @(1n)(1-8)src[2nt-1-row]
    vshl.s16    q9, q9, q7                  @(6)shr

    vadd.s8     d5, d5, d7                  @(7)

    vsub.s8     d6, d6, d7                  @(7)

    vmovn.i16   d18, q9                     @(6)
    vmlal.u8    q12, d5, d0                 @(8)

    vld1.s8     d5, [r5]                    @(row+1 value)
    vmlal.u8    q12, d8, d1                 @(8)

    vdup.s8     d20, d4[7]                  @(1n)(1)
    vmlal.u8    q12, d6, d3                 @(8)

    vst1.s8     d18, [r2], r3               @(6)str 8 values
    vmlal.u8    q12, d9, d23                @(8)

    vld1.s8     d8, [r12]                   @(1n)(1-8)load 8 coeffs [col+1]
    vsub.s8     d6, d2, d5                  @(nt-1-row) value

    subs        r7, r7, #8                  @col counter

    vld1.s8     d3, [r14]                   @(1n)(1-8)load 8 src[2nt+1+col]
    vshl.s16    q13, q13, q7                @(7)shr

    vdup.16     q6, r4                      @(1n)(1)
    vsub.s8     d9, d2, d8                  @(1n)(1-8)[nt-1-col]

    bne         kernel_plnr

epilog:

    vmovn.i16   d26, q13                    @(7)
    vst1.s8     d26, [r2], r3               @(7)str 8 values

    vshl.s16    q12, q12, q7                @(8)shr
    vmovn.i16   d24, q12                    @(8)
    vst1.s8     d24, [r2], r3               @(8)str 8 values

@@ ========== ***************** =====================

    @ the 8/16/32 path is complete. every path into epilog arrives with Z set
    @ and the neon instructions above leave the flags alone, so the original
    @ "beq" was always taken; branch unconditionally so the 4x4 code below can
    @ never be entered by accident.
    b           end_loop

tf_sz_4:
    vld1.s8     d10, [r14]                  @load src[2nt+1+col]
    vld1.s8     d8, [r12], r10              @load 8 coeffs [col+1]
loop_sz_4:
    mov         r10, #4                     @reduce inc to #4 for 4x4
    ldrb        r7, [r6], #-1               @src[2nt-1-row], one byte (dec to take into account row)
    vdup.s8     d4, r7                      @src[2nt-1-row]

    vsub.s8     d9, d2, d8                  @[nt-1-col]

    vmull.u8    q6, d5, d0                  @(row+1) * src[nt-1]
    vmlal.u8    q6, d6, d10                 @(nt-1-row) * src[2nt+1+col]
    vmlal.u8    q6, d8, d1                  @(col+1) * src[3nt+1]
    vmlal.u8    q6, d9, d4                  @(nt-1-col) * src[2nt-1-row]
@   vadd.i16    q6, q6, q8                  @add (nt)
@   vshl.s16    q6, q6, q7                  @shr
@   vmovn.i16   d12, q6
    vrshrn.s16  d12,q6,#3                   @(sum + 4) >> 3: rounding add, shift and narrow in one step
    vst1.s32    {d12[0]}, [r2], r3          @store the 4 pixels of this row

    vadd.s8     d5, d5, d7                  @row++ [(row+1)++]
    vsub.s8     d6, d6, d7                  @[nt-1-row]--
    subs        r1, r1, #1

    bne         loop_sz_4

end_loop:
    vpop        {d8 - d15}
    ldmfd       sp!,{r4-r12,r15}            @restore registers; loading pc returns to the caller