//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
//******************************************************************************
//* @file
//*  ih264_intra_pred_luma_16x16_av8.s
//*
//* @brief
//*  Contains function definitions for intra 16x16 Luma prediction.
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_intra_pred_luma_16x16_mode_vert_av8()
//*  - ih264_intra_pred_luma_16x16_mode_horz_av8()
//*  - ih264_intra_pred_luma_16x16_mode_dc_av8()
//*  - ih264_intra_pred_luma_16x16_mode_plane_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

///* All the functions here are replicated from ih264_intra_pred_filters.c
//


.text
.p2align 2
.include "ih264_neon_macros.s"
.extern ih264_gai1_intrapred_luma_plane_coeffs



///**
//*******************************************************************************
//*
//* ih264_intra_pred_luma_16x16_mode_vert
//*
//* @brief
//*  Perform Intra prediction for luma_16x16 mode:vertical
//*
//* @par Description:
//*  Perform Intra prediction for luma_16x16 mode:Vertical, described in
//*  sec 8.3.3.1. The 16 bytes starting at pu1_src + 17 are written unchanged
//*  to each of 16 destination rows; the destination pointer advances by
//*  dst_strd after every row.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (x2 is never read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride (32-bit; sign-extended before use)
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (x4 is never read by this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>  ui_neighboravailability


    .global ih264_intra_pred_luma_16x16_mode_vert_av8

ih264_intra_pred_luma_16x16_mode_vert_av8:

    push_v_regs                             // macro from ih264_neon_macros.s (not visible here);
                                            // presumably saves callee-saved SIMD registers
    sxtw      x3, w3                        // dst_strd is a 32-bit int: the upper half of x3 is
                                            // unspecified on entry, so widen before using it
                                            // as a 64-bit post-index

    add       x0, x0, #17                   // x0 = pu1_src + 17
    ld1       {v0.8b, v1.8b}, [x0]          // 16 prediction bytes (2 x 8)

    // 16 rows, each a copy of the same 16 bytes; x1 += dst_strd per row.
    .rept     16
    st1       {v0.8b, v1.8b}, [x1], x3
    .endr

    pop_v_regs
    ret





///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_horz
//*
//* @brief
//*  Perform Intra prediction for luma_16x16 mode:horizontal
//*
//* @par Description:
//*  Perform Intra prediction for luma_16x16 mode:horizontal ,described in sec 8.3.3.2
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels(Not used in this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
//                                           UWORD8 *pu1_dst,
//                                           WORD32 src_strd,
//                                           WORD32 dst_strd,
//                                           WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    x2 =>  src_strd
//    x3 =>  dst_strd
//    x4 =>
//           ui_neighboravailability

//------------------------------------------------------------------------------
// ih264_intra_pred_luma_16x16_mode_horz_av8
//
//   In : x0 = pu1_src, x1 = pu1_dst, w3 = dst_strd (32-bit int)
//        x2 (src_strd) and x4 (ui_neighboravailability) are never read.
//   Out: 16 rows of 16 bytes written at pu1_dst; row r is filled with the
//        single byte pu1_src[15 - r]. x1 advances by dst_strd after each row.
//   Regs written: x1, x3, v0, v10-v25.
//------------------------------------------------------------------------------

    .global ih264_intra_pred_luma_16x16_mode_horz_av8

ih264_intra_pred_luma_16x16_mode_horz_av8:

    push_v_regs                             // macro from ih264_neon_macros.s (not visible here);
                                            // presumably preserves v8-v15, of which v10-v15
                                            // are overwritten below
    sxtw      x3, w3                        // dst_strd is a 32-bit int: the upper half of x3 is
                                            // unspecified on entry, so widen before using it
                                            // as a 64-bit post-index

    ld1       {v0.16b}, [x0]                // v0 = pu1_src[0..15]

    // Broadcast one source byte per row, highest lane first. The broadcasts
    // run a few rows ahead of the stores so the two streams interleave.
    dup       v10.16b, v0.b[15]
    dup       v11.16b, v0.b[14]
    dup       v12.16b, v0.b[13]
    dup       v13.16b, v0.b[12]
    st1       {v10.16b}, [x1], x3           // row 0
    dup       v14.16b, v0.b[11]
    st1       {v11.16b}, [x1], x3           // row 1
    dup       v15.16b, v0.b[10]
    st1       {v12.16b}, [x1], x3           // row 2
    dup       v16.16b, v0.b[9]
    st1       {v13.16b}, [x1], x3           // row 3
    dup       v17.16b, v0.b[8]
    st1       {v14.16b}, [x1], x3           // row 4
    dup       v18.16b, v0.b[7]
    st1       {v15.16b}, [x1], x3           // row 5
    dup       v19.16b, v0.b[6]
    st1       {v16.16b}, [x1], x3           // row 6
    dup       v20.16b, v0.b[5]
    st1       {v17.16b}, [x1], x3           // row 7
    dup       v21.16b, v0.b[4]
    st1       {v18.16b}, [x1], x3           // row 8
    dup       v22.16b, v0.b[3]
    st1       {v19.16b}, [x1], x3           // row 9
    dup       v23.16b, v0.b[2]
    st1       {v20.16b}, [x1], x3           // row 10
    dup       v24.16b, v0.b[1]
    st1       {v21.16b}, [x1], x3           // row 11
    dup       v25.16b, v0.b[0]
    st1       {v22.16b}, [x1], x3           // row 12
    st1       {v23.16b}, [x1], x3           // row 13
    st1       {v24.16b}, [x1], x3           // row 14
    st1       {v25.16b}, [x1], x3           // row 15

    pop_v_regs
    ret







///******************************************************************************


///**
//*******************************************************************************
//*
//*ih264_intra_pred_luma_16x16_mode_dc
//*
//* @brief
//*  Perform Intra prediction for luma_16x16 mode:DC
//*
//* @par Description:
//*  Perform Intra prediction for luma_16x16 mode:DC ,described in sec 8.3.3.3
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
276//*******************************************************************************/ 277//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src, 278// UWORD8 *pu1_dst, 279// WORD32 src_strd, 280// WORD32 dst_strd, 281// WORD32 ui_neighboravailability) 282 283//**************Variables Vs Registers***************************************** 284// x0 => *pu1_src 285// x1 => *pu1_dst 286// x2 => src_strd 287// x3 => dst_strd 288// x4 => ui_neighboravailability 289 290 .global ih264_intra_pred_luma_16x16_mode_dc_av8 291 292ih264_intra_pred_luma_16x16_mode_dc_av8: 293 294 295 296 push_v_regs 297 stp x19, x20, [sp, #-16]! 298 299 sub v0.16b, v0.16b, v0.16b 300 sub v1.16b, v1.16b, v1.16b 301 mov w10, #0 302 mov w11 , #3 303 ands x6, x4, #0x01 304 beq top_available //LEFT NOT AVAILABLE 305 ld1 {v0.16b}, [x0] 306 add w10, w10, #8 307 add w11, w11, #1 308top_available: 309 ands x6, x4, #0x04 310 beq none_available 311 add x6, x0, #17 312 ld1 {v1.16b}, [x6] 313 add w10, w10, #8 314 add w11, w11, #1 315 b summation 316none_available: 317 cmp x4, #0 318 bne summation 319 mov w15, #128 320 dup v20.16b, w15 321 b store 322summation: 323 uaddl v2.8h, v0.8b, v1.8b 324 uaddl2 v3.8h, v0.16b, v1.16b 325 dup v10.8h, w10 326 neg w11, w11 327 dup v20.8h, w11 328 add v0.8h, v2.8h, v3.8h 329 mov v1.d[0], v0.d[1] 330 add v0.4h, v0.4h, v1.4h 331 addp v0.4h, v0.4h , v0.4h 332 addp v0.4h, v0.4h , v0.4h 333 add v0.4h, v0.4h, v10.4h 334 uqshl v0.8h, v0.8h, v20.8h 335 sqxtun v0.8b, v0.8h 336 dup v20.16b, v0.b[0] 337 338store: 339 340 st1 { v20.16b}, [x1], x3 341 st1 { v20.16b}, [x1], x3 342 st1 { v20.16b}, [x1], x3 343 st1 { v20.16b}, [x1], x3 344 st1 { v20.16b}, [x1], x3 345 st1 { v20.16b}, [x1], x3 346 st1 { v20.16b}, [x1], x3 347 st1 { v20.16b}, [x1], x3 348 st1 { v20.16b}, [x1], x3 349 st1 { v20.16b}, [x1], x3 350 st1 { v20.16b}, [x1], x3 351 st1 { v20.16b}, [x1], x3 352 st1 { v20.16b}, [x1], x3 353 st1 { v20.16b}, [x1], x3 354 st1 { v20.16b}, [x1], x3 355 st1 { v20.16b}, [x1], x3 356 357 
358 359end_func: 360 361 ldp x19, x20, [sp], #16 362 pop_v_regs 363 ret 364 365 366 367 368 369///****************************************************************************** 370 371 372///** 373//******************************************************************************* 374//* 375//*ih264_intra_pred_luma_16x16_mode_plane 376//* 377//* @brief 378//* Perform Intra prediction for luma_16x16 mode:PLANE 379//* 380//* @par Description: 381//* Perform Intra prediction for luma_16x16 mode:PLANE ,described in sec 8.3.3.4 382//* 383//* @param[in] pu1_src 384//* UWORD8 pointer to the source 385//* 386//* @param[out] pu1_dst 387//* UWORD8 pointer to the destination 388//* 389//* @param[in] src_strd 390//* integer source stride 391//* 392//* @param[in] dst_strd 393//* integer destination stride 394//* 395//* @param[in] ui_neighboravailability 396//* availability of neighbouring pixels 397//* 398//* @returns 399//* 400//* @remarks 401//* None 402//* 403//*******************************************************************************/ 404//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src, 405// UWORD8 *pu1_dst, 406// WORD32 src_strd, 407// WORD32 dst_strd, 408// WORD32 ui_neighboravailability) 409 410//**************Variables Vs Registers***************************************** 411// x0 => *pu1_src 412// x1 => *pu1_dst 413// x2 => src_strd 414// x3 => dst_strd 415// x4 => ui_neighboravailability 416 417 .global ih264_intra_pred_luma_16x16_mode_plane_av8 418ih264_intra_pred_luma_16x16_mode_plane_av8: 419 420 push_v_regs 421 stp x19, x20, [sp, #-16]! 
422 mov x2, x1 423 add x1, x0, #17 424 add x0, x0, #15 425 mov x8, #9 426 sub x1, x1, #1 427 mov x10, x1 //top_left 428 mov x4, #-1 429 ld1 {v2.2s}, [x1], x8 430 431 adrp x7, :got:ih264_gai1_intrapred_luma_plane_coeffs 432 ldr x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs] 433 434 ld1 {v0.2s}, [x1] 435 rev64 v2.8b, v2.8b 436 ld1 {v6.2s, v7.2s}, [x7] 437 usubl v0.8h, v0.8b, v2.8b 438 uxtl v16.8h, v6.8b 439 mul v0.8h, v0.8h , v16.8h 440 uxtl v18.8h, v7.8b 441 add x7, x0, x4, lsl #3 442 sub x0, x7, x4, lsl #1 443 sub x20, x4, #0x0 444 neg x14, x20 445 addp v0.8h, v0.8h, v1.8h 446 ldrb w8, [x7], #-1 447 sxtw x8, w8 448 ldrb w9, [x0], #1 449 sxtw x9, w9 450 saddlp v0.2s, v0.4h 451 sub x12, x8, x9 452 ldrb w8, [x7], #-1 453 sxtw x8, w8 454 saddlp v0.1d, v0.2s 455 ldrb w9, [x0], #1 456 sxtw x9, w9 457 sub x8, x8, x9 458 shl v2.2s, v0.2s, #2 459 add x12, x12, x8, lsl #1 460 add v0.2s, v0.2s , v2.2s 461 ldrb w8, [x7], #-1 462 sxtw x8, w8 463 ldrb w9, [x0], #1 464 sxtw x9, w9 465 srshr v0.2s, v0.2s, #6 // i_b = D0[0] 466 sub x8, x8, x9 467 ldrb w5, [x7], #-1 468 sxtw x5, w5 469 add x8, x8, x8, lsl #1 470 dup v4.8h, v0.h[0] 471 add x12, x12, x8 472 ldrb w9, [x0], #1 473 sxtw x9, w9 474 mul v0.8h, v4.8h , v16.8h 475 sub x5, x5, x9 476 mul v2.8h, v4.8h , v18.8h 477 add x12, x12, x5, lsl #2 478 ldrb w8, [x7], #-1 479 sxtw x8, w8 480 ldrb w9, [x0], #1 481 sxtw x9, w9 482 sub x8, x8, x9 483 ldrb w5, [x7], #-1 484 sxtw x5, w5 485 add x8, x8, x8, lsl #2 486 ldrb w6, [x0], #1 487 sxtw x6, w6 488 add x12, x12, x8 489 ldrb w8, [x7], #-1 490 sxtw x8, w8 491 ldrb w9, [x0], #1 492 sxtw x9, w9 493 sub x5, x5, x6 494 sub x8, x8, x9 495 add x5, x5, x5, lsl #1 496 sub x20, x8, x8, lsl #3 497 neg x8, x20 498 add x12, x12, x5, lsl #1 499 ldrb w5, [x7], #-1 500 sxtw x5, w5 501 ldrb w6, [x10] //top_left 502 sxtw x6, w6 503 add x12, x12, x8 504 sub x9, x5, x6 505 ldrb w6, [x1, #7] 506 sxtw x6, w6 507 add x12, x12, x9, lsl #3 // i_c = x12 508 add x8, x5, x6 509 add x12, x12, x12, lsl 
#2 510 lsl x8, x8, #4 // i_a = x8 511 add x12, x12, #0x20 512 lsr x12, x12, #6 513 shl v28.8h, v4.8h, #3 514 dup v6.8h, w12 515 dup v30.8h, w8 516 shl v26.8h, v6.8h, #3 517 sub v30.8h, v30.8h , v28.8h 518 sub v30.8h, v30.8h , v26.8h 519 add v28.8h, v30.8h , v6.8h 520 add v26.8h, v28.8h , v0.8h 521 add v28.8h, v28.8h , v2.8h 522 sqrshrun v20.8b, v26.8h, #5 523 sqrshrun v21.8b, v28.8h, #5 524 add v26.8h, v26.8h , v6.8h 525 add v28.8h, v28.8h , v6.8h 526 sqrshrun v22.8b, v26.8h, #5 527 st1 {v20.2s, v21.2s}, [x2], x3 528 sqrshrun v23.8b, v28.8h, #5 529 add v26.8h, v26.8h , v6.8h 530 add v28.8h, v28.8h , v6.8h 531 sqrshrun v20.8b, v26.8h, #5 532 st1 {v22.2s, v23.2s}, [x2], x3 533 sqrshrun v21.8b, v28.8h, #5 534 add v26.8h, v26.8h , v6.8h 535 add v28.8h, v28.8h , v6.8h 536 sqrshrun v22.8b, v26.8h, #5 537 st1 {v20.2s, v21.2s}, [x2], x3 538 sqrshrun v23.8b, v28.8h, #5 539 add v26.8h, v26.8h , v6.8h 540 add v28.8h, v28.8h , v6.8h 541 sqrshrun v20.8b, v26.8h, #5 542 st1 {v22.2s, v23.2s}, [x2], x3 543 sqrshrun v21.8b, v28.8h, #5 544 add v26.8h, v26.8h , v6.8h 545 add v28.8h, v28.8h , v6.8h 546 sqrshrun v22.8b, v26.8h, #5 547 st1 {v20.2s, v21.2s}, [x2], x3 548 sqrshrun v23.8b, v28.8h, #5 549 add v26.8h, v26.8h , v6.8h 550 add v28.8h, v28.8h , v6.8h 551 sqrshrun v20.8b, v26.8h, #5 552 st1 {v22.2s, v23.2s}, [x2], x3 553 sqrshrun v21.8b, v28.8h, #5 554 add v26.8h, v26.8h , v6.8h 555 add v28.8h, v28.8h , v6.8h 556 sqrshrun v22.8b, v26.8h, #5 557 st1 {v20.2s, v21.2s}, [x2], x3 558 sqrshrun v23.8b, v28.8h, #5 559 add v26.8h, v26.8h , v6.8h 560 add v28.8h, v28.8h , v6.8h 561 sqrshrun v20.8b, v26.8h, #5 562 st1 {v22.2s, v23.2s}, [x2], x3 563 sqrshrun v21.8b, v28.8h, #5 564 add v26.8h, v26.8h , v6.8h 565 add v28.8h, v28.8h , v6.8h 566 sqrshrun v22.8b, v26.8h, #5 567 st1 {v20.2s, v21.2s}, [x2], x3 568 sqrshrun v23.8b, v28.8h, #5 569 add v26.8h, v26.8h , v6.8h 570 add v28.8h, v28.8h , v6.8h 571 sqrshrun v20.8b, v26.8h, #5 572 st1 {v22.2s, v23.2s}, [x2], x3 573 sqrshrun v21.8b, v28.8h, #5 
574 add v26.8h, v26.8h , v6.8h 575 add v28.8h, v28.8h , v6.8h 576 sqrshrun v22.8b, v26.8h, #5 577 st1 {v20.2s, v21.2s}, [x2], x3 578 sqrshrun v23.8b, v28.8h, #5 579 add v26.8h, v26.8h , v6.8h 580 add v28.8h, v28.8h , v6.8h 581 sqrshrun v20.8b, v26.8h, #5 582 st1 {v22.2s, v23.2s}, [x2], x3 583 sqrshrun v21.8b, v28.8h, #5 584 add v26.8h, v26.8h , v6.8h 585 add v28.8h, v28.8h , v6.8h 586 sqrshrun v22.8b, v26.8h, #5 587 st1 {v20.2s, v21.2s}, [x2], x3 588 sqrshrun v23.8b, v28.8h, #5 589 add v26.8h, v26.8h , v6.8h 590 add v28.8h, v28.8h , v6.8h 591 sqrshrun v20.8b, v26.8h, #5 592 st1 {v22.2s, v23.2s}, [x2], x3 593 sqrshrun v21.8b, v28.8h, #5 594 add v26.8h, v26.8h , v6.8h 595 add v28.8h, v28.8h , v6.8h 596 sqrshrun v22.8b, v26.8h, #5 597 st1 {v20.2s, v21.2s}, [x2], x3 598 sqrshrun v23.8b, v28.8h, #5 599 st1 {v22.2s, v23.2s}, [x2], x3 600 601end_func_plane: 602 603 ldp x19, x20, [sp], #16 604 pop_v_regs 605 ret 606 607