1//****************************************************************************** 2//* 3//* Copyright (C) 2015 The Android Open Source Project 4//* 5//* Licensed under the Apache License, Version 2.0 (the "License"); 6//* you may not use this file except in compliance with the License. 7//* You may obtain a copy of the License at: 8//* 9//* http://www.apache.org/licenses/LICENSE-2.0 10//* 11//* Unless required by applicable law or agreed to in writing, software 12//* distributed under the License is distributed on an "AS IS" BASIS, 13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14//* See the License for the specific language governing permissions and 15//* limitations under the License. 16//* 17//***************************************************************************** 18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19//*/ 20///** 21//****************************************************************************** 22//* @file 23//* ih264_intra_pred_luma_16x16_av8.s 24//* 25//* @brief 26//* Contains function definitions for intra 16x16 Luma prediction . 
//*
//* @author
//*  Ittiam
//*
//* @par List of Functions:
//*
//*  - ih264_intra_pred_luma_16x16_mode_vert_av8()
//*  - ih264_intra_pred_luma_16x16_mode_horz_av8()
//*  - ih264_intra_pred_luma_16x16_mode_dc_av8()
//*  - ih264_intra_pred_luma_16x16_mode_plane_av8()
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/

// All the functions here are replicated from ih264_intra_pred_filters.c


.text
.p2align 2
.include "ih264_neon_macros.s"
.extern ih264_gai1_intrapred_luma_plane_coeffs



///**
//*******************************************************************************
//*
//* ih264_intra_pred_luma_16x16_mode_vert
//*
//* @brief
//*  Intra prediction for luma 16x16, vertical mode
//*
//* @par Description:
//*  Vertical luma 16x16 intra prediction (sec 8.3.3.1). The 16 bytes located
//*  at pu1_src + 17 are copied unchanged into each of the 16 destination rows.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source (neighbour array)
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (not used in this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_luma_16x16_mode_vert(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ui_neighboravailability


    .global ih264_intra_pred_luma_16x16_mode_vert_av8

ih264_intra_pred_luma_16x16_mode_vert_av8:

    push_v_regs                             // macro supplied by ih264_neon_macros.s
    sxtw      x3, w3                        // dst_strd is a signed 32-bit value; widen it
                                            // so it can be used as a post-index register

    add       x0, x0, #17                   // x0 -> the 16 bytes that get replicated
    ld1       {v0.16b}, [x0]

    // One 16-byte store per destination row, stepping by dst_strd.
    .rept     16
    st1       {v0.16b}, [x1], x3
    .endr

    pop_v_regs
    ret





///******************************************************************************


///**
//*******************************************************************************
//*
//* ih264_intra_pred_luma_16x16_mode_horz
//*
//* @brief
//*  Intra prediction for luma 16x16, horizontal mode
//*
//* @par Description:
//*  Horizontal luma 16x16 intra prediction (sec 8.3.3.2). Destination row r
//*  (r = 0..15) is filled with 16 copies of the byte pu1_src[15 - r].
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source (neighbour array)
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (not used in this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************
//*/
//void ih264_intra_pred_luma_16x16_mode_horz(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)
//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ui_neighboravailability

    .global ih264_intra_pred_luma_16x16_mode_horz_av8

ih264_intra_pred_luma_16x16_mode_horz_av8:

    push_v_regs                             // macro supplied by ih264_neon_macros.s
    sxtw      x3, w3                        // widen dst_strd for post-index addressing

    ld1       {v0.16b}, [x0]                // v0 = pu1_src[0..15]

    // Row r is a broadcast of v0.b[15 - r].  The broadcasts run a few rows
    // ahead of the stores so that each store finds its data already prepared.
    dup       v16.16b, v0.b[15]             // row 0
    dup       v17.16b, v0.b[14]             // row 1
    dup       v18.16b, v0.b[13]             // row 2
    dup       v19.16b, v0.b[12]             // row 3
    st1       {v16.16b}, [x1], x3
    dup       v20.16b, v0.b[11]             // row 4
    st1       {v17.16b}, [x1], x3
    dup       v21.16b, v0.b[10]             // row 5
    st1       {v18.16b}, [x1], x3
    dup       v22.16b, v0.b[9]              // row 6
    st1       {v19.16b}, [x1], x3
    dup       v23.16b, v0.b[8]              // row 7
    st1       {v20.16b}, [x1], x3
    dup       v24.16b, v0.b[7]              // row 8
    st1       {v21.16b}, [x1], x3
    dup       v25.16b, v0.b[6]              // row 9
    st1       {v22.16b}, [x1], x3
    dup       v26.16b, v0.b[5]              // row 10
    st1       {v23.16b}, [x1], x3
    dup       v27.16b, v0.b[4]              // row 11
    st1       {v24.16b}, [x1], x3
    dup       v28.16b, v0.b[3]              // row 12
    st1       {v25.16b}, [x1], x3
    dup       v29.16b, v0.b[2]              // row 13
    st1       {v26.16b}, [x1], x3
    dup       v30.16b, v0.b[1]              // row 14
    st1       {v27.16b}, [x1], x3
    dup       v31.16b, v0.b[0]              // row 15
    st1       {v28.16b}, [x1], x3
    st1       {v29.16b}, [x1], x3
    st1       {v30.16b}, [x1], x3
    st1       {v31.16b}, [x1], x3

    pop_v_regs
    ret







///******************************************************************************


///**
//*******************************************************************************
//*
//* ih264_intra_pred_luma_16x16_mode_dc
//*
//* @brief
//*  Intra prediction for luma 16x16, DC mode
//*
//* @par Description:
//*  DC luma 16x16 intra prediction (sec 8.3.3.3). Every pixel of the 16x16
//*  destination block receives the same value, derived from the neighbours
//*  that ui_neighboravailability marks as usable.
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source (neighbour array)
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_16x16_mode_dc(UWORD8 *pu1_src,
//                                       UWORD8 *pu1_dst,
//                                       WORD32 src_strd,
//                                       WORD32 dst_strd,
//                                       WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src
//    x1 => *pu1_dst
//    w2 => src_strd                (not read)
//    w3 => dst_strd
//    w4 => ui_neighboravailability (bit 0x01 = left, bit 0x04 = top)
//
//    w10 => rounding term, 8 per edge that was loaded
//    w11 => shift count,   3 + number of edges that were loaded
//
// Behaviour:
//    left  (bit 0x01) : 16 bytes at pu1_src       are added to the sum
//    top   (bit 0x04) : 16 bytes at pu1_src + 17  are added to the sum
//    dc = (sum + w10) >> w11 when at least one edge was loaded,
//    dc = 128                when neither edge was loaded.
//
// The "neither edge" decision is taken from w10 (what was actually loaded),
// not from the raw availability word.  Testing the whole word against zero
// sent any value with bits 0x01 and 0x04 clear but some other bit set through
// the summation with all-zero inputs, which predicted 0 instead of 128.
//
// Scratch registers: x6, w10, w11, w12, v0-v3, v20 (none are callee-saved).
// NOTE(review): this routine touches no callee-saved register itself;
// push_v_regs/pop_v_regs are kept because their definition lives in
// ih264_neon_macros.s and is not visible here.

    .global ih264_intra_pred_luma_16x16_mode_dc_av8

ih264_intra_pred_luma_16x16_mode_dc_av8:

    push_v_regs
    sxtw      x3, w3                        // widen dst_strd for post-index addressing

    movi      v0.16b, #0                    // left neighbours, zero unless loaded
    movi      v1.16b, #0                    // top neighbours,  zero unless loaded
    mov       w10, #0
    mov       w11, #3

    tbz       w4, #0, .Ldc_left_done        // left not available
    ld1       {v0.16b}, [x0]
    add       w10, w10, #8
    add       w11, w11, #1
.Ldc_left_done:

    tbz       w4, #2, .Ldc_top_done         // top not available
    add       x6, x0, #17
    ld1       {v1.16b}, [x6]
    add       w10, w10, #8
    add       w11, w11, #1
.Ldc_top_done:

    cbnz      w10, .Ldc_sum                 // at least one edge contributed
    movi      v20.16b, #128                 // no usable edge: mid-grey default
    b         .Ldc_store

.Ldc_sum:
    uaddl     v2.8h, v0.8b, v1.8b           // left[0..7]  + top[0..7]
    uaddl2    v3.8h, v0.16b, v1.16b         // left[8..15] + top[8..15]
    add       v2.8h, v2.8h, v3.8h
    addv      h2, v2.8h                     // total of up to 32 bytes, <= 32 * 255
    umov      w12, v2.h[0]
    add       w12, w12, w10                 // + rounding
    lsr       w12, w12, w11                 // >> (3 + edges); result is <= 255
    dup       v20.16b, w12

.Ldc_store:
    // Same 16-byte pattern for all 16 rows.
    .rept     16
    st1       {v20.16b}, [x1], x3
    .endr

    pop_v_regs
    ret





///******************************************************************************


///**
//*******************************************************************************
//*
//* ih264_intra_pred_luma_16x16_mode_plane
//*
//* @brief
//*  Intra prediction for luma 16x16, plane mode
//*
//* @par Description:
//*  Plane luma 16x16 intra prediction (sec 8.3.3.4).
//*
//*  With src = pu1_src, the routine computes
//*    H   = sum over i = 0..7 of coeff[i] * (src[25 + i] - src[23 - i])
//*    V   = sum over k = 1..8 of k * (src[8 - k] - src[8 + k])
//*    i_b = (5 * H + 32) >> 6
//*    i_c = (5 * V + 32) >> 6
//*    i_a = 16 * (src[0] + src[32])
//*  and writes, for row y and column x (both 0..15),
//*    sat_u8((i_a - 8*i_b - 7*i_c + i_b*coeff[x] + i_c*y + 16) >> 5)
//*  where coeff[] is the first 16 bytes of
//*  ih264_gai1_intrapred_luma_plane_coeffs (defined outside this file;
//*  presumably 1..16 - TODO confirm against the table definition).
//*
//* @param[in] pu1_src
//*  UWORD8 pointer to the source (neighbour array)
//*
//* @param[out] pu1_dst
//*  UWORD8 pointer to the destination
//*
//* @param[in] src_strd
//*  integer source stride (not read by this function)
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] ui_neighboravailability
//*  availability of neighbouring pixels (not read by this function)
//*
//* @returns
//*
//* @remarks
//*  None
//*
//*******************************************************************************/
//void ih264_intra_pred_luma_16x16_mode_plane(UWORD8 *pu1_src,
//                                        UWORD8 *pu1_dst,
//                                        WORD32 src_strd,
//                                        WORD32 dst_strd,
//                                        WORD32 ui_neighboravailability)

//**************Variables Vs Registers*****************************************
//    x0 => *pu1_src   (re-pointed into the neighbour array below)
//    x1 => *pu1_dst   (moved to x2; x1 is then reused as a source pointer)
//    w2 => src_strd
//    w3 => dst_strd
//    w4 => ui_neighboravailability (overwritten with -1, used as a step)
//
// Scratch registers: x0-x12, v0, v2, v4, v6, v7, v16, v18, v20-v23, v26, v28,
// v30 (none are callee-saved).
// NOTE(review): no callee-saved register is touched by this routine itself;
// push_v_regs/pop_v_regs are kept because their definition lives in
// ih264_neon_macros.s and is not visible here.

    .global ih264_intra_pred_luma_16x16_mode_plane_av8
ih264_intra_pred_luma_16x16_mode_plane_av8:

    push_v_regs
    sxtw      x3, w3                        // widen dst_strd for post-index addressing
    mov       x2, x1                        // x2 = destination write pointer
    add       x1, x0, #17
    add       x0, x0, #15
    mov       x8, #9
    sub       x1, x1, #1                    // x1 = src + 16
    mov       x10, x1                       // top_left
    mov       x4, #-1
    ld1       {v2.2s}, [x1], x8             // src[16..23]; x1 advances to src + 25

    adrp      x7, :got:ih264_gai1_intrapred_luma_plane_coeffs
    ldr       x7, [x7, #:got_lo12:ih264_gai1_intrapred_luma_plane_coeffs]

    ld1       {v0.2s}, [x1]                 // src[25..32]
    rev64     v2.8b, v2.8b                  // src[23], src[22], ... , src[16]
    ld1       {v6.2s, v7.2s}, [x7]          // 16 coefficient bytes
    usubl     v0.8h, v0.8b, v2.8b           // src[25 + i] - src[23 - i]
    uxtl      v16.8h, v6.8b                 // coeff[0..7]  as 16-bit
    mul       v0.8h, v0.8h , v16.8h
    uxtl      v18.8h, v7.8b                 // coeff[8..15] as 16-bit
    add       x7, x0, x4, lsl #3            // x7 = src + 7, walks downwards
    sub       x0, x7, x4, lsl #1            // x0 = src + 9, walks upwards
    addp      v0.8h, v0.8h, v0.8h           // only the low four lanes are consumed
    ldrb      w8, [x7], #-1
    ldrb      w9, [x0], #1
    saddlp    v0.2s, v0.4h
    sub       w12, w8, w9                   // 1 * (src[7] - src[9])
    ldrb      w8, [x7], #-1
    saddlp    v0.1d, v0.2s                  // H
    ldrb      w9, [x0], #1
    sub       w8, w8, w9
    shl       v2.2s, v0.2s, #2
    add       w12, w12, w8, lsl #1          // + 2 * (src[6] - src[10])
    add       v0.2s, v0.2s , v2.2s          // 5 * H
    ldrb      w8, [x7], #-1
    ldrb      w9, [x0], #1
    srshr     v0.2s, v0.2s, #6              // i_b = D0[0]
    sub       w8, w8, w9
    ldrb      w5, [x7], #-1
    add       w8, w8, w8, lsl #1
    dup       v4.8h, v0.h[0]                // v4 = i_b in every lane
    add       w12, w12, w8                  // + 3 * (src[5] - src[11])
    ldrb      w9, [x0], #1
    mul       v0.8h, v4.8h , v16.8h         // i_b * coeff[0..7]
    sub       w5, w5, w9
    mul       v2.8h, v4.8h , v18.8h         // i_b * coeff[8..15]
    add       w12, w12, w5, lsl #2          // + 4 * (src[4] - src[12])
    ldrb      w8, [x7], #-1
    ldrb      w9, [x0], #1
    sub       w8, w8, w9
    ldrb      w5, [x7], #-1
    add       w8, w8, w8, lsl #2
    ldrb      w6, [x0], #1
    add       w12, w12, w8                  // + 5 * (src[3] - src[13])
    ldrb      w8, [x7], #-1
    ldrb      w9, [x0], #1
    sub       w5, w5, w6
    sub       w8, w8, w9
    add       w5, w5, w5, lsl #1
    lsl       w11, w8, #3
    sub       w8, w11, w8                   // 7 * (src[1] - src[15])
    add       w12, w12, w5, lsl #1          // + 6 * (src[2] - src[14])
    ldrb      w5, [x7], #-1                 // src[0]
    ldrb      w6, [x10]                     //top_left
    add       w12, w12, w8                  // + 7 * (src[1] - src[15])
    sub       w9, w5, w6
    ldrb      w6, [x1, #7]                  // src[32]
    add       w12, w12, w9, lsl #3          // + 8 * (src[0] - src[16]) -> V
    add       w8, w5, w6
    add       w12, w12, w12, lsl #2         // 5 * V
    lsl       w8, w8, #4                    // i_a = w8
    add       w12, w12, #0x20
    asr       w12, w12, #6                  // i_c = w12 (signed)
    shl       v28.8h, v4.8h, #3             // 8 * i_b
    dup       v6.8h, w12                    // v6 = i_c in every lane
    dup       v30.8h, w8
    shl       v26.8h, v6.8h, #3             // 8 * i_c
    sub       v30.8h, v30.8h , v28.8h
    sub       v30.8h, v30.8h , v26.8h       // i_a - 8*i_b - 8*i_c
    add       v28.8h, v30.8h , v6.8h        // i_a - 8*i_b - 7*i_c
    add       v26.8h, v28.8h , v0.8h        // row 0, columns 0..7  (before >> 5)
    add       v28.8h, v28.8h , v2.8h        // row 0, columns 8..15 (before >> 5)

    // v26/v28 hold the running row accumulators; each new row adds i_c.
    // Rows are narrowed alternately into {v20,v21} and {v22,v23} so that the
    // store of one row overlaps the narrowing of the next.
    sqrshrun  v20.8b, v26.8h, #5
    sqrshrun  v21.8b, v28.8h, #5

    .rept     7
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3
    sqrshrun  v23.8b, v28.8h, #5
    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v20.8b, v26.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3
    sqrshrun  v21.8b, v28.8h, #5
    .endr

    add       v26.8h, v26.8h , v6.8h
    add       v28.8h, v28.8h , v6.8h
    sqrshrun  v22.8b, v26.8h, #5
    st1       {v20.2s, v21.2s}, [x2], x3    // row 14
    sqrshrun  v23.8b, v28.8h, #5
    st1       {v22.2s, v23.2s}, [x2], x3    // row 15

    pop_v_regs
    ret
