//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// *  ih264_padding_neon.s
// *
// * @brief
// *  Contains function definitions for padding
// *
// * @author
// *  Ittiam
// *
// * @par List of Functions:
// *  - ih264_pad_top_av8()
// *  - ih264_pad_left_luma_av8()
// *  - ih264_pad_left_chroma_av8()
// *  - ih264_pad_right_luma_av8()
// *  - ih264_pad_right_chroma_av8()
// *
// * @remarks
// *  None
// *
// *******************************************************************************
//*/

.text
.p2align 2
.include "ih264_neon_macros.s"

///**
//*******************************************************************************
//*
//* @brief pad at the top of a 2d array
//*
//* @par Description:
//*  The top row of a 2d array is replicated pad_size times above itself.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source (the top row of the array)
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] wd
//*  integer width of the array; consumed in 16-byte chunks, so a width that
//*  is not a multiple of 16 is rounded up to the next multiple of 16
//*
//* @param[in] pad_size
//*  integer number of rows to write above the source row
//*
//* @returns none
//*
//* @remarks
//*  Returns immediately, without touching memory, when wd <= 0 or
//*  pad_size <= 0.
//*
//*******************************************************************************
//*/
//void ih264_pad_top(UWORD8 *pu1_src,
//                   WORD32 src_strd,
//                   WORD32 wd,
//                   WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => wd
//    w3 => pad_size
//
// Working registers:
//    x4 => store cursor inside the current 16-byte column
//    x5 => first pad row (one row above the source) of the current column
//    x6 => -src_strd, post-index step that walks the cursor upwards
//    w7 => rows left to write in the current column
//    v0 => 16 source bytes of the current column

    .global ih264_pad_top_av8

ih264_pad_top_av8:

    // Nothing to do for an empty request. Without these guards the
    // subs/branch counters below would wrap around and keep writing.
    cmp       w3, #0
    ble       pad_top_ret
    cmp       w2, #0
    ble       pad_top_ret

    // Prologue. NOTE(review): only x0-x7 and v0 are used below, so these
    // saves look removable - confirm what push_v_regs preserves first.
    push_v_regs
    sxtw      x1, w1                    // stride is a signed 32-bit argument
    stp       x19, x20, [sp, #-16]!

    sub       x5, x0, x1                // x5 = pu1_src - src_strd
    neg       x6, x1                    // x6 = -src_strd

loop_neon_memcpy_mul_16:
    ld1       {v0.16b}, [x0], #16       // fetch 16 bytes of the top row
    mov       x4, x5                    // restart at the first pad row
    mov       w7, w3                    // w7 = pad_size
    add       x5, x5, #16               // next column for the outer loop

loop_neon_pad_top:
    st1       {v0.16b}, [x4], x6        // write the row, then step one row up
    subs      w7, w7, #1
    bgt       loop_neon_pad_top

    subs      w2, w2, #16
    bgt       loop_neon_memcpy_mul_16   // bgt (not bne): also stops if wd % 16 != 0

    // Epilogue: undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
pad_top_ret:
    ret




///**
//*******************************************************************************
//*
//* @brief
//*   Padding (luma block) at the left of a 2d array
//*
//* @par Description:
//*   The left column of a 2d array is replicated for pad_size times at the left
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer -padding size of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_LEFT_LUMA == C
//void ih264_pad_left_luma(UWORD8 *pu1_src,
//                         WORD32 src_strd,
//                         WORD32 ht,
//                         WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size
//
// Working registers:
//    x0 => read cursor, walks down the first column one row per load
//    x4 => write cursor, starts at pu1_src - pad_size
//    x6 => src_strd - 16, step from the middle of a 32-byte row to the next row
//    v0, v2, v4, v6 => one source byte each, replicated into all 16 lanes
//
// pad_size == 16 takes the 16-byte path; every other value takes the
// 32-byte path. Each loop iteration handles 8 rows and only stops when the
// row counter reaches exactly zero, so ht has to be a positive multiple of 8.
//
// loop_16, loop_32 and end_func are file-scope labels - do not rename them
// without checking for references elsewhere in this file.



    .global ih264_pad_left_luma_av8

ih264_pad_left_luma_av8:

    // Prologue: save the SIMD registers covered by push_v_regs and x19/x20.
    push_v_regs
    sxtw      x1, w1                    // stride is a signed 32-bit argument
    sxtw      x3, w3                    // so is pad_size
    stp       x19, x20, [sp, #-16]!


    sub       x4, x0, x3                // destination = pu1_src - pad_size
    sub       x6, x1, #16               // second-half step for 32-byte rows
    cmp       x3, #16
    bne       loop_32

loop_16:                                // 16 bytes per row, 8 rows per pass
    // rows 0-3: broadcast the first byte of each row
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    subs      w2, w2, #8                // eight rows done
    bne       loop_16
    b         end_func

loop_32:                                // 32 bytes per row, 8 rows per pass
    // rows 0-3: each row is written as two 16-byte halves
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      w2, w2, #8                // eight rows done
    bne       loop_32



end_func:
    // Epilogue: undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret





///**
//*******************************************************************************
//*
//* @brief
//*   Padding (chroma block) at the left of a 2d array
//*
//* @par Description:
//*   The left column of a 2d array is replicated for pad_size times at the left
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer -padding size of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_LEFT_CHROMA == C
//void ih264_pad_left_chroma(UWORD8 *pu1_src,
//                           WORD32 src_strd,
//                           WORD32 ht,
//                           WORD32 pad_size)
//**************Variables Vs Registers*************************
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size
//
// Working registers:
//    x0 => read cursor, walks down the first 2-byte pair of each row
//    x4 => write cursor, starts at pu1_src - pad_size
//    x6 => src_strd - 16, step from the middle of a 32-byte row to the next row
//    v0, v2, v4, v6 => one 16-bit pair each, replicated into all 8 lanes
//
// 32 bytes are written per row whatever pad_size is; pad_size only sets the
// start address. Four rows are handled per pass and the loop stops only when
// the row counter reaches exactly zero, so ht has to be a positive multiple
// of 4.



    .global ih264_pad_left_chroma_av8

ih264_pad_left_chroma_av8:

    // Prologue: save the SIMD registers covered by push_v_regs and x19/x20.
    push_v_regs
    sxtw      x1, w1                    // stride is a signed 32-bit argument
    sxtw      x3, w3                    // so is pad_size
    stp       x19, x20, [sp, #-16]!

    sub       x4, x0, x3                // destination = pu1_src - pad_size
    sub       x6, x1, #16               // second-half step for 32-byte rows


loop_32_l_c:                            // 32 bytes per row, 4 rows per pass
    ld1r      {v0.8h}, [x0], x1         // broadcast the first pair of each row
    ld1r      {v2.8h}, [x0], x1
    ld1r      {v4.8h}, [x0], x1
    ld1r      {v6.8h}, [x0], x1
    st1       {v0.16b}, [x4], #16       // each row is two 16-byte halves
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      w2, w2, #4                // four rows done
    bne       loop_32_l_c

end_func_l_c:
    // Epilogue: undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret





///**
//*******************************************************************************
//*
//* @brief
//* Padding (luma block) at the right of a 2d array
//*
//* @par Description:
//* The right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer -padding size of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_RIGHT_LUMA == C
//void ih264_pad_right_luma(UWORD8 *pu1_src,
//                        WORD32 src_strd,
//                        WORD32 ht,
//                        WORD32 pad_size)
//{
//    WORD32 row;
//
//    for(row = 0; row < ht; row++)
//    {
//        memset(pu1_src, *(pu1_src -1), pad_size);
//
//        pu1_src += src_strd;
//    }
//}
//
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size
//
// Working registers:
//    x0 => read cursor, starts at pu1_src - 1 (last byte of each row)
//    x4 => write cursor, starts at pu1_src
//    x6 => src_strd - 16, step from the middle of a 32-byte row to the next row
//    v0, v2, v4, v6 => one source byte each, replicated into all 16 lanes
//
// pad_size == 16 takes the 16-byte path; every other value takes the
// 32-byte path. Eight rows are handled per pass and the loops stop only when
// the row counter reaches exactly zero, so ht has to be a positive multiple
// of 8.



    .global ih264_pad_right_luma_av8

ih264_pad_right_luma_av8:

    // Prologue: save the SIMD registers covered by push_v_regs and x19/x20.
    push_v_regs
    sxtw      x1, w1                    // stride is a signed 32-bit argument
    sxtw      x3, w3                    // so is pad_size
    stp       x19, x20, [sp, #-16]!

    mov       x4, x0                    // padding starts at pu1_src
    sub       x6, x1, #16               // second-half step for 32-byte rows
    sub       x0, x0, #1                // byte to replicate is *(pu1_src - 1)
    cmp       x3, #16
    bne       loop_32_r                 // stay inside this function's own loop

loop_16_r:                              // 16 bytes per row, 8 rows per pass
    // rows 0-3
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], x1
    st1       {v2.16b}, [x4], x1
    st1       {v4.16b}, [x4], x1
    st1       {v6.16b}, [x4], x1
    subs      w2, w2, #8                // eight rows done
    bne       loop_16_r
    b         end_func_r

loop_32_r:                              // 32 bytes per row, 8 rows per pass
    // rows 0-3: each row is written as two 16-byte halves
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    // rows 4-7
    ld1r      {v0.16b}, [x0], x1
    ld1r      {v2.16b}, [x0], x1
    ld1r      {v4.16b}, [x0], x1
    ld1r      {v6.16b}, [x0], x1
    st1       {v0.16b}, [x4], #16
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      w2, w2, #8                // eight rows done
    bne       loop_32_r



end_func_r:
    // Epilogue: undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret





///**
//*******************************************************************************
//*
//* @brief
//* Padding (chroma block) at the right of a 2d array
//*
//* @par Description:
//* The right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] ht
//*  integer height of the array
//*
//* @param[in] pad_size
//*  integer -padding size of the array
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//#if PAD_RIGHT_CHROMA == C
//void ih264_pad_right_chroma(UWORD8 *pu1_src,
//                        WORD32 src_strd,
//                        WORD32 ht,
//                        WORD32 pad_size)
//    x0 => *pu1_src
//    w1 => src_strd
//    w2 => ht
//    w3 => pad_size
//
// Working registers:
//    x0 => read cursor, starts at pu1_src - 2 (last 2-byte pair of each row)
//    x4 => write cursor, starts at pu1_src
//    x6 => src_strd - 16, step from the middle of a 32-byte row to the next row
//    v0, v2, v4, v6 => one 16-bit pair each, replicated into all 8 lanes
//
// pad_size (w3) is not read: 32 bytes are always written per row. Four rows
// are handled per pass and the loop stops only when the row counter reaches
// exactly zero, so ht has to be a positive multiple of 4.



    .global ih264_pad_right_chroma_av8

ih264_pad_right_chroma_av8:

    // Prologue: save the SIMD registers covered by push_v_regs and x19/x20.
    push_v_regs
    sxtw      x1, w1                    // stride is a signed 32-bit argument
    stp       x19, x20, [sp, #-16]!

    mov       x4, x0                    // padding starts at pu1_src
    sub       x6, x1, #16               // second-half step for 32-byte rows
    sub       x0, x0, #2                // pair to replicate sits just before it

loop_32_r_c:                            // 32 bytes per row, 4 rows per pass
    ld1r      {v0.8h}, [x0], x1         // broadcast the last pair of each row
    ld1r      {v2.8h}, [x0], x1
    ld1r      {v4.8h}, [x0], x1
    ld1r      {v6.8h}, [x0], x1
    st1       {v0.16b}, [x4], #16       // each row is two 16-byte halves
    st1       {v0.16b}, [x4], x6
    st1       {v2.16b}, [x4], #16
    st1       {v2.16b}, [x4], x6
    st1       {v4.16b}, [x4], #16
    st1       {v4.16b}, [x4], x6
    st1       {v6.16b}, [x4], #16
    st1       {v6.16b}, [x4], x6
    subs      w2, w2, #4                // four rows done
    bne       loop_32_r_c

end_func_r_c:
    // Epilogue: undo the prologue in reverse order.
    ldp       x19, x20, [sp], #16
    pop_v_regs
    ret