///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
// *******************************************************************************
// * //file
// *  ihevc_padding_neon.s
// *
// * //brief
// *  contains function definitions for padding
// *
// * //author
// *  naveen sr
// *
// * //par list of functions:
// *  - ihevc_pad_left_luma_av8()
// *  - ihevc_pad_left_chroma_av8()
// *  - ihevc_pad_right_luma_av8()
// *  - ihevc_pad_right_chroma_av8()
// *
// * //remarks
// *  GNU as syntax, AArch64.  All functions are leaf functions that use only
// *  x0-x11 and v0/v2/v4/v6 and do not touch the stack.
// *
// *******************************************************************************
// */

///**
//*******************************************************************************
//*
//* //brief
//*   padding (luma block) at the left of a 2d array
//*
//* //par description:
//*   the left column of a 2d array is replicated to the left of each row
//*
//* //param[in] pu1_src
//*  uword8 pointer to the first (left-most) sample of the first row
//*
//* //param[in] src_strd
//*  integer source stride, in bytes
//*
//* //param[in] ht
//*  integer height of the array; rows are processed four at a time, so this
//*  is expected to be a positive multiple of 4.  ht <= 0 returns immediately.
//*  For any other value the last iteration still writes four full rows.
//*
//* //param[in] pad_size
//*  integer padding size; only used to locate the start of the padding
//*  (pu1_src - pad_size).  Exactly 80 bytes are written per row regardless of
//*  this value, i.e. the routine assumes pad_size == 80.
//*
//* //returns
//*  none
//*
//* //remarks
//*  The integer arguments are C 32-bit ints.  They are sign-extended / used
//*  through their w-register views because the upper 32 bits of the argument
//*  registers are not guaranteed by the procedure call standard.
//*
//*******************************************************************************
//*/
//void ihevc_pad_left_luma(uword8 *pu1_src,
//                         word32 src_strd,
//                         word32 ht,
//                         word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src          (advanced by 4 * src_strd per iteration)
//    w1 => src_strd          (sign-extended into x1)
//    w2 => ht                (remaining rows)
//    w3 => pad_size          (sign-extended into x3)
//    x4..x7   => destination pointers for rows 0..3
//    w8..w11  => left-most sample of rows 0..3
//    v0,v2,v4,v6 => that sample replicated into all 16 byte lanes

.text
.align 4

.globl ihevc_pad_left_luma_av8

.type ihevc_pad_left_luma_av8, %function

ihevc_pad_left_luma_av8:

    sxtw        x1,w1                       // x1 = (int64) src_strd
    sxtw        x3,w3                       // x3 = (int64) pad_size
    cmp         w2,#0
    ble         end_luma_left               // nothing to pad for ht <= 0

loop_start_luma_left:
    // pad size is assumed to be pad_left = 80
    sub         x4,x0,x3                    // x4 = start of padding, row 0

    ldrb        w8,[x0]                     // left-most sample, row 0
    add         x0,x0,x1
    ldrb        w9,[x0]                     // left-most sample, row 1
    add         x0,x0,x1
    ldrb        w10,[x0]                    // left-most sample, row 2
    add         x0,x0,x1
    ldrb        w11,[x0]                    // left-most sample, row 3
    add         x0,x0,x1                    // x0 = first row of next iteration

    dup         v0.16b,w8
    dup         v2.16b,w9
    dup         v4.16b,w10
    dup         v6.16b,w11

    add         x5,x4,x1                    // x5 = start of padding, row 1

    st1         {v0.16b},[x4],#16           // row 0: 5 x 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    add         x6,x5,x1                    // x6 = start of padding, row 2

    st1         {v2.16b},[x5],#16           // row 1: 5 x 16 = 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    add         x7,x6,x1                    // x7 = start of padding, row 3

    st1         {v4.16b},[x6],#16           // row 2: 5 x 16 = 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    subs        w2,w2,#4                    // four rows done; st1 leaves flags intact

    st1         {v6.16b},[x7],#16           // row 3: 5 x 16 = 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store

    bgt         loop_start_luma_left        // signed test: stops at or below zero

end_luma_left:
    ret





///**
//*******************************************************************************
//*
//* //brief
//*   padding (chroma block) at the left of a 2d array
//*
//* //par description:
//*   the left-most 16-bit element of each row is replicated to the left of
//*   that row (presumably one interleaved chroma sample pair - confirm against
//*   the caller's buffer layout)
//*
//* //param[in] pu1_src
//*  uword8 pointer to the first (left-most) 2-byte element of the first row
//*
//* //param[in] src_strd
//*  integer source stride, in bytes
//*
//* //param[in] ht
//*  integer height of the array; rows are processed four at a time, so this
//*  is expected to be a positive multiple of 4.  ht <= 0 returns immediately.
//*  For any other value the last iteration still writes four full rows.
//*
//* //param[in] pad_size
//*  integer padding size in bytes; only used to locate the start of the
//*  padding (pu1_src - pad_size).  Exactly 80 bytes are written per row
//*  regardless of this value, i.e. the routine assumes pad_size == 80.
//*
//* //returns
//*  none
//*
//* //remarks
//*  The integer arguments are C 32-bit ints.  They are sign-extended / used
//*  through their w-register views because the upper 32 bits of the argument
//*  registers are not guaranteed by the procedure call standard.
//*
//*******************************************************************************
//*/
//void ihevc_pad_left_chroma(uword8 *pu1_src,
//                           word32 src_strd,
//                           word32 ht,
//                           word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src          (advanced by 4 * src_strd per iteration)
//    w1 => src_strd          (sign-extended into x1)
//    w2 => ht                (remaining rows)
//    w3 => pad_size          (sign-extended into x3)
//    x4..x7   => destination pointers for rows 0..3
//    w8..w11  => left-most 16-bit element of rows 0..3
//    v0,v2,v4,v6 => that element replicated into all 8 halfword lanes



.globl ihevc_pad_left_chroma_av8

.type ihevc_pad_left_chroma_av8, %function

ihevc_pad_left_chroma_av8:

    sxtw        x1,w1                       // x1 = (int64) src_strd
    sxtw        x3,w3                       // x3 = (int64) pad_size
    cmp         w2,#0
    ble         end_chroma_left             // nothing to pad for ht <= 0

loop_start_chroma_left:
    // pad size is assumed to be pad_left = 80
    sub         x4,x0,x3                    // x4 = start of padding, row 0

    ldrh        w8,[x0]                     // left-most 2-byte element, row 0
    add         x0,x0,x1
    ldrh        w9,[x0]                     // left-most 2-byte element, row 1
    add         x0,x0,x1
    ldrh        w10,[x0]                    // left-most 2-byte element, row 2
    add         x0,x0,x1
    ldrh        w11,[x0]                    // left-most 2-byte element, row 3
    add         x0,x0,x1                    // x0 = first row of next iteration

    dup         v0.8h,w8
    dup         v2.8h,w9
    dup         v4.8h,w10
    dup         v6.8h,w11

    add         x5,x4,x1                    // x5 = start of padding, row 1

    st1         {v0.16b},[x4],#16           // row 0: 5 x 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    add         x6,x5,x1                    // x6 = start of padding, row 2

    st1         {v2.16b},[x5],#16           // row 1: 5 x 16 = 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    add         x7,x6,x1                    // x7 = start of padding, row 3

    st1         {v4.16b},[x6],#16           // row 2: 5 x 16 = 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    subs        w2,w2,#4                    // four rows done; st1 leaves flags intact

    st1         {v6.16b},[x7],#16           // row 3: 5 x 16 = 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_left store

    bgt         loop_start_chroma_left      // signed test: stops at or below zero

end_chroma_left:
    ret





///**
//*******************************************************************************
//*
//* //brief
//*   padding (luma block) at the right of a 2d array
//*
//* //par description:
//*   the right column of a 2d array is replicated to the right of each row
//*
//* //param[in] pu1_src
//*  uword8 pointer to the first padding byte of the first row, i.e. one past
//*  the right-most sample (the sample is read from pu1_src[-1])
//*
//* //param[in] src_strd
//*  integer source stride, in bytes
//*
//* //param[in] ht
//*  integer height of the array; rows are processed four at a time, so this
//*  is expected to be a positive multiple of 4.  ht <= 0 returns immediately.
//*  For any other value the last iteration still writes four full rows.
//*
//* //param[in] pad_size
//*  integer padding size; NOT read by this implementation.  Exactly 80 bytes
//*  are written per row, i.e. the routine assumes pad_size == 80.
//*
//* //returns
//*  none
//*
//* //remarks
//*  The integer arguments are C 32-bit ints.  They are sign-extended / used
//*  through their w-register views because the upper 32 bits of the argument
//*  registers are not guaranteed by the procedure call standard.
//*
//*******************************************************************************
//*/
//void ihevc_pad_right_luma(uword8 *pu1_src,
//                          word32 src_strd,
//                          word32 ht,
//                          word32 pad_size)
//{
//    word32 row;
//
//    for(row = 0; row < ht; row++)
//    {
//        memset(pu1_src, *(pu1_src -1), pad_size);
//
//        pu1_src += src_strd;
//    }
//}
//
//**************variables vs registers*************************
//    x0 => *pu1_src          (advanced by 4 * src_strd per iteration)
//    w1 => src_strd          (sign-extended into x1)
//    w2 => ht                (remaining rows)
//    w3 => pad_size          (unused)
//    x4..x7   => destination pointers for rows 0..3
//    w8..w11  => right-most sample of rows 0..3
//    v0,v2,v4,v6 => that sample replicated into all 16 byte lanes



.globl ihevc_pad_right_luma_av8

.type ihevc_pad_right_luma_av8, %function

ihevc_pad_right_luma_av8:

    sxtw        x1,w1                       // x1 = (int64) src_strd
    cmp         w2,#0
    ble         end_luma_right              // nothing to pad for ht <= 0

loop_start_luma_right:
    // pad size is assumed to be pad_right = 80
    mov         x4,x0                       // x4 = start of padding, row 0

    ldrb        w8,[x0, #-1]                // right-most sample, row 0
    add         x0,x0,x1
    ldrb        w9,[x0, #-1]                // right-most sample, row 1
    add         x0,x0,x1
    ldrb        w10,[x0, #-1]               // right-most sample, row 2
    add         x0,x0,x1
    ldrb        w11,[x0, #-1]               // right-most sample, row 3
    add         x0,x0,x1                    // x0 = first row of next iteration

    add         x5,x4,x1                    // x5 = start of padding, row 1
    add         x6,x5,x1                    // x6 = start of padding, row 2
    add         x7,x6,x1                    // x7 = start of padding, row 3

    dup         v0.16b,w8
    dup         v2.16b,w9
    dup         v4.16b,w10
    dup         v6.16b,w11

    st1         {v0.16b},[x4],#16           // row 0: 5 x 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]


    st1         {v2.16b},[x5],#16           // row 1: 5 x 16 = 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    subs        w2,w2,#4                    // four rows done; st1 leaves flags intact

    st1         {v4.16b},[x6],#16           // row 2: 5 x 16 = 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    st1         {v6.16b},[x7],#16           // row 3: 5 x 16 = 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]


    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_right store


    bgt         loop_start_luma_right       // signed test: stops at or below zero

end_luma_right:
    ret





///**
//*******************************************************************************
//*
//* //brief
//*   padding (chroma block) at the right of a 2d array
//*
//* //par description:
//*   the right-most 16-bit element of each row is replicated to the right of
//*   that row (presumably one interleaved chroma sample pair - confirm against
//*   the caller's buffer layout)
//*
//* //param[in] pu1_src
//*  uword8 pointer to the first padding byte of the first row, i.e. one past
//*  the right-most 2-byte element (the element is read from pu1_src[-2])
//*
//* //param[in] src_strd
//*  integer source stride, in bytes
//*
//* //param[in] ht
//*  integer height of the array; rows are processed four at a time, so this
//*  is expected to be a positive multiple of 4.  ht <= 0 returns immediately.
//*  For any other value the last iteration still writes four full rows.
//*
//* //param[in] pad_size
//*  integer padding size in bytes; NOT read by this implementation.  Exactly
//*  80 bytes are written per row, i.e. the routine assumes pad_size == 80.
//*
//* //returns
//*  none
//*
//* //remarks
//*  The integer arguments are C 32-bit ints.  They are sign-extended / used
//*  through their w-register views because the upper 32 bits of the argument
//*  registers are not guaranteed by the procedure call standard.
//*
//*******************************************************************************
//*/
//void ihevc_pad_right_chroma(uword8 *pu1_src,
//                            word32 src_strd,
//                            word32 ht,
//                            word32 pad_size)
//**************variables vs registers*************************
//    x0 => *pu1_src          (advanced by 4 * src_strd per iteration)
//    w1 => src_strd          (sign-extended into x1)
//    w2 => ht                (remaining rows)
//    w3 => pad_size          (unused)
//    x4..x7   => destination pointers for rows 0..3
//    w8..w11  => right-most 16-bit element of rows 0..3
//    v0,v2,v4,v6 => that element replicated into all 8 halfword lanes



.globl ihevc_pad_right_chroma_av8

.type ihevc_pad_right_chroma_av8, %function

ihevc_pad_right_chroma_av8:

    sxtw        x1,w1                       // x1 = (int64) src_strd
    cmp         w2,#0
    ble         end_chroma_right            // nothing to pad for ht <= 0

loop_start_chroma_right:
    // pad size is assumed to be pad_right = 80
    mov         x4,x0                       // x4 = start of padding, row 0

    ldrh        w8,[x0, #-2]                // right-most 2-byte element, row 0
    add         x0,x0,x1
    ldrh        w9,[x0, #-2]                // right-most 2-byte element, row 1
    add         x0,x0,x1
    ldrh        w10,[x0, #-2]               // right-most 2-byte element, row 2
    add         x0,x0,x1
    ldrh        w11,[x0, #-2]               // right-most 2-byte element, row 3
    add         x0,x0,x1                    // x0 = first row of next iteration

    dup         v0.8h,w8
    dup         v2.8h,w9
    dup         v4.8h,w10
    dup         v6.8h,w11

    add         x5,x4,x1                    // x5 = start of padding, row 1

    st1         {v0.16b},[x4],#16           // row 0: 5 x 16 = 80 bytes
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4],#16
    st1         {v0.16b},[x4]

    add         x6,x5,x1                    // x6 = start of padding, row 2

    st1         {v2.16b},[x5],#16           // row 1: 5 x 16 = 80 bytes
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5],#16
    st1         {v2.16b},[x5]

    add         x7,x6,x1                    // x7 = start of padding, row 3

    st1         {v4.16b},[x6],#16           // row 2: 5 x 16 = 80 bytes
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6],#16
    st1         {v4.16b},[x6]

    subs        w2,w2,#4                    // four rows done; st1 leaves flags intact

    st1         {v6.16b},[x7],#16           // row 3: 5 x 16 = 80 bytes
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7],#16
    st1         {v6.16b},[x7]

    // total of 4rows*(16*5) = 4 * 80 = 4 * pad_right store

    bgt         loop_start_chroma_right     // signed test: stops at or below zero

end_chroma_right:
    ret