@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@*
@ *******************************************************************************
@ * @file
@ *  ih264_padding_neon.s
@ *
@ * @brief
@ *  Contains function definitions for padding
@ *
@ * @author
@ *  Ittiam
@ *
@ * @par List of Functions:
@ *  - ih264_pad_top_a9q()
@ *  - ih264_pad_left_luma_a9q()
@ *  - ih264_pad_left_chroma_a9q()
@ *  - ih264_pad_right_luma_a9q()
@ *  - ih264_pad_right_chroma_a9q()
@ *
@ * @remarks
@ *  Syntax: GNU as, ARM with NEON. Every function receives its four
@ *  arguments in r0-r3, saves r4-r11 and lr on entry and restores them on
@ *  exit. NEON registers q0-q3 and the condition flags are overwritten.
@ *  Loop counters are tested with "subs ... / bne", so each routine only
@ *  terminates for the sizes listed in its own header.
@ *
@ *******************************************************************************
@*


@**
@*******************************************************************************
@*
@* @brief pad at the top of a 2d array
@*
@* @par Description:
@*  The top row of a 2d array is replicated pad_size times above it.
@*  The row is processed in 16-byte columns: 16 bytes are loaded from the
@*  source row and stored pad_size times, stepping upwards by src_strd.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (first byte of the top row)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] wd
@*  integer width of the array; must be a non-zero multiple of 16
@*
@* @param[in] pad_size
@*  integer number of rows to write above the source; must be >= 1
@*
@* @returns none
@*
@* @remarks none
@*
@*******************************************************************************
@*
@void ih264_pad_top(UWORD8 *pu1_src,
@                   WORD32 src_strd,
@                   WORD32 wd,
@                   WORD32 pad_size)
@**************Variables Vs Registers*************************
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => wd
@   r3 => pad_size

.text
.p2align 2

    .global ih264_pad_top_a9q

ih264_pad_top_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and return address

    sub           r5, r0, r1            @ r5 = pu1_src - src_strd (first pad row, current column)
    rsb           r6, r1, #0            @ r6 = -src_strd (step one row up per store)

loop_neon_memcpy_mul_16:
    @ Load 16 bytes of the source row
    vld1.8        {d0, d1}, [r0]!
    mov           r4, r5                @ r4 = store pointer for this column
    mov           r7, r3                @ r7 = rows left to write in this column
    add           r5, r5, #16           @ advance to the next 16-byte column

loop_neon_pad_top:
    vst1.8        {d0, d1}, [r4], r6    @ store 16 bytes, then move one row up
    subs          r7, r7, #1
    bne           loop_neon_pad_top

    subs          r2, r2, #16           @ 16 bytes of width consumed
    bne           loop_neon_memcpy_mul_16

    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return




@**
@*******************************************************************************
@*
@* @brief
@*  Padding (luma block) at the left of a 2d array
@*
@* @par Description:
@*  For every row, the byte at pu1_src is replicated into the bytes starting
@*  at (pu1_src - pad_size). pad_size == 16 writes 16 bytes per row; any
@*  other pad_size takes the path that writes 32 bytes per row.
@*  Rows are processed eight at a time.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (left-most pixel of the first row)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array; must be a non-zero multiple of 8
@*
@* @param[in] pad_size
@*  integer padding size of the array (16, otherwise handled as 32)
@*
@* @returns none
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_left_luma(UWORD8 *pu1_src,
@                         WORD32 src_strd,
@                         WORD32 ht,
@                         WORD32 pad_size)
@**************Variables Vs Registers*************************
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size



    .global ih264_pad_left_luma_a9q

ih264_pad_left_luma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and return address


    sub           r4, r0, r3            @ r4 = destination = pu1_src - pad_size
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after the 2nd 16-byte store)
    subs          r5, r3, #16           @ pad_size == 16 ?
    bne           loop_32               @ no: use the 32-byte-wide path
loop_16:                                @ /*hard coded for width=16  ,height =8,16*/
    ldrb          r8, [r0], r1
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    ldrb          r11, [r0], r1
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    subs          r2, r2, #8            @ eight rows done; flags survive the NEON stores below
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    bne           loop_16
    b             end_func

loop_32:                                @ /*hard coded for width=32 ,height =8,16*/
    ldrb          r8, [r0], r1
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store, then on to the next row
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vdup.u8       q0, r8
    ldrb          r9, [r0], r1
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #8            @ eight rows done; flags survive the NEON stores below
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    bne           loop_32



end_func:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return





@**
@*******************************************************************************
@*
@* @brief
@*  Padding (chroma block) at the left of a 2d array
@*
@* @par Description:
@*  For every row, the 16-bit value at pu1_src (presumably one interleaved
@*  U/V pair - confirm against the caller) is replicated sixteen times,
@*  i.e. 32 bytes are written starting at (pu1_src - pad_size).
@*  The store width is fixed at 32 bytes regardless of pad_size.
@*  Rows are processed in groups of four.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the source (left-most pair of the first row)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array; must be a non-zero multiple of 4
@*
@* @param[in] pad_size
@*  integer padding size of the array in bytes (only used to locate the
@*  destination; 32 bytes are always written per row)
@*
@* @returns none
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_left_chroma(UWORD8 *pu1_src,
@                           WORD32 src_strd,
@                           WORD32 ht,
@                           WORD32 pad_size)
@**************Variables Vs Registers*************************
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size



    .global ih264_pad_left_chroma_a9q

ih264_pad_left_chroma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and return address

    sub           r4, r0, r3            @ r4 = destination = pu1_src - pad_size
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after the 2nd 16-byte store)


loop_32_l_c:                            @ /*hard coded for width=32  ,height =4,8,12*/
    ldrh          r8, [r0], r1
    ldrh          r9, [r0], r1
    vdup.u16      q0, r8
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrh          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four rows done; flags survive the NEON stores below
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store


    beq           end_func_l_c          @/* Branching when ht=4*/

    ldrh          r8, [r0], r1
    ldrh          r9, [r0], r1
    vdup.u16      q0, r8
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrh          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four more rows done
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store

    bne           loop_32_l_c           @ rows remain (e.g. ht=12): go round again
                                        @ otherwise (ht=8) fall through to the exit

end_func_l_c:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return





@**
@*******************************************************************************
@*
@* @brief
@*  Padding (luma block) at the right of a 2d array
@*
@* @par Description:
@*  For every row, the byte at (pu1_src - 1) is replicated into the bytes
@*  starting at pu1_src. pad_size == 16 writes 16 bytes per row; any other
@*  pad_size takes the path that writes 32 bytes per row.
@*  Rows are processed eight at a time.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the first byte to be padded (one past the right-most
@*  pixel of the first row)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array; must be a non-zero multiple of 8
@*
@* @param[in] pad_size
@*  integer padding size of the array (16, otherwise handled as 32)
@*
@* @returns none
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_right_luma(UWORD8 *pu1_src,
@                        WORD32 src_strd,
@                        WORD32 ht,
@                        WORD32 pad_size)
@{
@    WORD32 row;
@
@    for(row = 0; row < ht; row++)
@    {
@        memset(pu1_src, *(pu1_src -1), pad_size);
@
@        pu1_src += src_strd;
@    }
@}
@
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size



    .global ih264_pad_right_luma_a9q

ih264_pad_right_luma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and return address

    mov           r4, r0                @ r4 = destination = pu1_src
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after the 2nd 16-byte store)
    sub           r0, r0, #1            @ r0 = address of the right-most source pixel
    subs          r5, r3, #16           @ pad_size == 16 ?
    bne           loop_32_r             @ no: use this function's own 32-byte-wide path
loop_16_r:                              @ /*hard coded for width=16  ,height =8,16*/
    ldrb          r8, [r0], r1
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    ldrb          r11, [r0], r1
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4], r1        @ 16 bytes store
    vdup.u8       q1, r9
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4], r1        @ 16 bytes store
    vdup.u8       q2, r10
    vdup.u8       q3, r11
    subs          r2, r2, #8            @ eight rows done; flags survive the NEON stores below
    vst1.8        {q2}, [r4], r1        @ 16 bytes store
    vst1.8        {q3}, [r4], r1        @ 16 bytes store
    bne           loop_16_r
    b             end_func_r

loop_32_r:                              @ /*hard coded for width=32  ,height =8,16*/
    ldrb          r8, [r0], r1
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store, then on to the next row
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    ldrb          r8, [r0], r1
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    ldrb          r9, [r0], r1
    vdup.u8       q0, r8
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    ldrb          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u8       q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrb          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u8       q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u8       q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #8            @ eight rows done; flags survive the NEON stores below
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store
    bne           loop_32_r



end_func_r:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return





@**
@*******************************************************************************
@*
@* @brief
@*  Padding (chroma block) at the right of a 2d array
@*
@* @par Description:
@*  For every row, the 16-bit value at (pu1_src - 2) (presumably the last
@*  interleaved U/V pair of the row - confirm against the caller) is
@*  replicated sixteen times, i.e. 32 bytes are written starting at pu1_src.
@*  The store width is fixed at 32 bytes; pad_size is not read.
@*  Rows are processed in groups of four.
@*
@* @param[in] pu1_src
@*  UWORD8 pointer to the first byte to be padded (one pair past the
@*  right-most pair of the first row)
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] ht
@*  integer height of the array; must be a non-zero multiple of 4
@*
@* @param[in] pad_size
@*  integer padding size of the array (unused by this implementation)
@*
@* @returns none
@*
@* @remarks
@*  None
@*
@*******************************************************************************
@*
@void ih264_pad_right_chroma(UWORD8 *pu1_src,
@                        WORD32 src_strd,
@                        WORD32 ht,
@                        WORD32 pad_size)
@**************Variables Vs Registers*************************
@   r0 => *pu1_src
@   r1 => src_strd
@   r2 => ht
@   r3 => pad_size



    .global ih264_pad_right_chroma_a9q

ih264_pad_right_chroma_a9q:

    stmfd         sp!, {r4-r11, lr}     @ save callee-saved registers and return address

    mov           r4, r0                @ r4 = destination = pu1_src
    sub           r6, r1, #16           @ r6 = src_strd - 16 (row step after the 2nd 16-byte store)
    sub           r0, r0, #2            @ r0 = address of the last 16-bit pair of the row
loop_32_r_c:                            @ /*hard coded for width=32 ,height =8,4*/
    ldrh          r8, [r0], r1
    ldrh          r9, [r0], r1
    vdup.u16      q0, r8
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four rows done; nothing below alters the flags
    ldrh          r11, [r0], r1
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store

    beq           end_func_r_c          @/* Branching when ht=4*/

    ldrh          r8, [r0], r1
    vdup.u16      q0, r8
    ldrh          r9, [r0], r1
    ldrh          r10, [r0], r1
    vst1.8        {q0}, [r4]!           @ 16 bytes store
    vdup.u16      q1, r9
    vst1.8        {q0}, [r4], r6        @ 16 bytes store
    ldrh          r11, [r0], r1
    vst1.8        {q1}, [r4]!           @ 16 bytes store
    vdup.u16      q2, r10
    vst1.8        {q1}, [r4], r6        @ 16 bytes store
    vst1.8        {q2}, [r4]!           @ 16 bytes store
    vdup.u16      q3, r11
    vst1.8        {q2}, [r4], r6        @ 16 bytes store
    subs          r2, r2, #4            @ four more rows done
    vst1.8        {q3}, [r4]!           @ 16 bytes store
    vst1.8        {q3}, [r4], r6        @ 16 bytes store

    bne           loop_32_r_c           @ rows remain (e.g. ht=12): go round again
                                        @ otherwise (ht=8) fall through to the exit

end_func_r_c:
    ldmfd         sp!, {r4-r11, pc}     @ restore registers and return