///*****************************************************************************
//*
//* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************/
///**
//*******************************************************************************
//* @file
//*  ihevc_intra_pred_chroma_mode_3_to_9.s
//*
//* @brief
//*  contains function definitions for chroma intra prediction modes 3 to 9.
//*  functions are coded in AArch64 NEON assembly (GNU assembler syntax)
//*
//* @author
//*  parthiban v
//*
//* @par list of functions:
//*  - ihevc_intra_pred_chroma_mode_3_to_9_av8()
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
///**
//*******************************************************************************
//*
//* @brief
//*  chroma intra prediction for modes 3 to 9
//*
//* @par description:
//*  The destination is produced in tiles of 8 bytes x 8 rows. For every tile
//*  the product of the low byte of gai4_ihevc_ang_table[mode] and 8 bytes taken
//*  from col_for_intra_chroma is split into idx (product >> 5) and
//*  fract (product & 31). 32 reference bytes are loaded and two taps per
//*  output byte are gathered from them with tbl. Each output byte is
//*      (tap0 * (32 - fract) + tap1 * fract + 16) >> 5
//*  When nt == 4 only the first four rows of the first tile are stored.
//*
//* @param[in] pu1_ref
//*  uword8 pointer to the source
//*
//* @param[in] src_strd
//*  integer source stride (not read by this implementation)
//*
//* @param[out] pu1_dst
//*  uword8 pointer to the destination
//*
//* @param[in] dst_strd
//*  integer destination stride
//*
//* @param[in] nt
//*  size of tranform block
//*  NOTE(review): the loop counters assume nt is 4 or a multiple of 8 -
//*  presumably 4, 8 or 16; confirm against callers
//*
//* @param[in] mode
//*  prediction mode; indexes gai4_ihevc_ang_table and, as (mode - 3),
//*  idx_neg_idx_chroma_3_9
//*
//* @returns
//*
//* @remarks
//*  none
//*
//*******************************************************************************
//*/
//void ihevc_intra_pred_chroma_mode_3_to_9(uword8 *pu1_ref,
//                                         word32 src_strd,
//                                         uword8 *pu1_dst,
//                                         word32 dst_strd,
//                                         word32 nt,
//                                         word32 mode)
//**************variables vs registers*****************************************
//x0 => *pu1_ref   (reused as the row offset once the reference base is in x1)
//x1 => src_strd   (never read; reused as the reference base pointer)
//x2 => *pu1_dst
//x3 => dst_strd
//x4 => nt
//x5 => mode
//
//x6  => address of the 32 reference bytes loaded for the current tile
//x7  => 8 - 8 * dst_strd (bottom of one tile -> top of the tile to its right)
//x8  => pointer to the current entry of idx_neg_idx_chroma_3_9
//x9  => 2 * (current idx table entry) (+ 2 * x0 inside the kernel)
//x10 => tile counter, nt * (nt / 8), decremented by 4 per tile
//x11 => column counter in bytes, 2 * nt, decremented by 8 per tile
//x12 => pointer to the first idx_neg_idx_chroma_3_9 entry for this mode
//x14 => running pointer into col_for_intra_chroma
//x20 => scratch for the csel based conditional updates

.text
.align 4

.include "ihevc_neon_macros.s"



.globl ihevc_intra_pred_chroma_mode_3_to_9_av8
.extern gai4_ihevc_ang_table
.extern gai4_ihevc_inv_ang_table
.extern col_for_intra_chroma
.extern idx_neg_idx_chroma_3_9

.type ihevc_intra_pred_chroma_mode_3_to_9_av8, %function

ihevc_intra_pred_chroma_mode_3_to_9_av8:

    // Callee-saved registers touched below: d13, d14, d15 and x20.
    // sp has to stay 16-byte aligned while it is used to access memory, so
    // every save is a 16-byte pair: d8 only pads the d15 slot and x19 only
    // pads the x20 slot.
    stp         d13,d14,[sp,#-16]!
    stp         d8,d15,[sp,#-16]!
    stp         x19, x20,[sp,#-16]!

    // dst_strd, nt and mode are 32-bit arguments; the upper halves of
    // x3/x4/x5 are not guaranteed by the procedure call standard, yet the
    // code below uses the full 64-bit registers. Sign-extend once here.
    sxtw        x3, w3                      //dst_strd
    sxtw        x4, w4                      //nt
    sxtw        x5, w5                      //mode

    adrp        x7,  :got:gai4_ihevc_ang_table
    ldr         x7,  [x7, #:got_lo12:gai4_ihevc_ang_table]

    add         x7, x7, x5, lsl #2          //&gai4_ihevc_ang_table[mode]
    ldr         w7,  [x7]                   //intra_pred_ang
    dup         v30.8b,w7                   //intra_pred_ang (low byte) in all 8 lanes

    adrp        x14,  :got:col_for_intra_chroma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]

prologue_8_16_32:
    lsr         x10, x4, #3
    ld1         {v31.8b},[x14],#8
    mul         x10, x4, x10                //tile counter = nt * (nt / 8) (dec by #4 per tile)

    lsl         x11, x4, #1                 //col counter = 2 * nt bytes, dec by #8 per tile
    smull       v22.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)

    sub         x7, x5, #3
    adrp        x12, :got:idx_neg_idx_chroma_3_9 //load most idx table
    ldr         x12, [x12, #:got_lo12:idx_neg_idx_chroma_3_9]

    add         x12, x12, x7, lsl #4        //16 bytes of table per mode, starting at mode 3
    mov         x8, x12

    mov         x7, #8
    sub         x7, x7, x3, lsl #3          //x7 = 8 - 8 * dst_strd

    ldrsw       x9,  [x8]                   //most idx for the first tile
    lsl         x9, x9, #1
    add         x1, x0, x4, lsl #2          //pu1_ref + 4*nt

    xtn         v6.8b,  v22.8h
    dup         v26.8b,w9                   //most idx added to final idx values
    sub         x1, x1, #26                 //ref_main_idx + 2nt - (8 + 1)(two_nt - idx - row ) for 8 & 8 - 1row

    sub         x6, x1, x9

    ld1         {v0.16b, v1.16b}, [x6]      //stores the 32 values reqd based on indices values (from most idx)
    sshr        v22.8h, v22.8h,#5

    movi        v29.8b, #31                 //contains #31 for vand operation

    movi        v28.8b, #32

    sqxtn       v2.8b,  v22.8h
    shl         v2.8b, v2.8b,#1             // 2 * idx

    and         v6.8b,  v6.8b ,  v29.8b     //fract values in v6 / idx values in v2
    movi        v29.8b, #2                  //byte distance between the two taps / two consecutive rows

    mov         x0,#0x302                   // idx value for v is +1 of u
    dup         v27.4h,w0
    mov         x0,#0                       //row offset of the current band of 8 rows

    movi        v3.8b, #22                  //row 0 to 7

    sub         v2.8b,  v2.8b ,  v27.8b     //ref_main_idx (sub row)
    sub         v2.8b,  v26.8b ,  v2.8b     //ref_main_idx (row 0)
    add         v2.8b,  v2.8b ,  v3.8b      //to compensate the pu1_src idx incremented by 8
    sub         v3.8b,  v2.8b ,  v29.8b     //second tap index (row 0) = first tap index - 2
    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 0)
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 0)
    sub         v4.8b,  v2.8b ,  v29.8b     //first tap index (row 1)
    sub         v5.8b,  v3.8b ,  v29.8b     //second tap index (row 1)

    movi        v29.8b, #4                  //v2/v3 and v4/v5 each advance two rows at a time

    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 1)
    umull       v24.8h, v25.8b, v7.8b       //mul (row 0)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 1)
    sub         v2.8b,  v2.8b ,  v29.8b     //first tap index (row 2)
    sub         v3.8b,  v3.8b ,  v29.8b     //second tap index (row 2)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)

    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 2)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 1)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 2)
    sub         v4.8b,  v4.8b ,  v29.8b     //first tap index (row 3)
    sub         v5.8b,  v5.8b ,  v29.8b     //second tap index (row 3)

    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 3)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 3)
    sub         v2.8b,  v2.8b ,  v29.8b     //first tap index (row 4)
    sub         v3.8b,  v3.8b ,  v29.8b     //second tap index (row 4)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 4)
    umull       v18.8h, v19.8b, v7.8b       //mul (row 3)
    umlal       v18.8h, v23.8b, v6.8b       //mul (row 3)

    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 4)
    sub         v4.8b,  v4.8b ,  v29.8b     //first tap index (row 5)
    sub         v5.8b,  v5.8b ,  v29.8b     //second tap index (row 5)

    st1         {v20.8b},[x2], x3           //st (row 2)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)

    tbl         v16.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 5)
    umull       v24.8h, v25.8b, v7.8b       //mul (row 4)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 5)
    sub         v2.8b,  v2.8b ,  v29.8b     //first tap index (row 6)
    sub         v3.8b,  v3.8b ,  v29.8b     //second tap index (row 6)

    st1         {v18.8b},[x2], x3           //st (row 3)
    cmp         x4,#4                       //nt == 4: four rows of 8 bytes is the whole block
    beq         end_func
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)

    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 6)
    umull       v22.8h, v16.8b, v7.8b       //mul (row 5)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 6)
    sub         v4.8b,  v4.8b ,  v29.8b     //first tap index (row 7)
    sub         v5.8b,  v5.8b ,  v29.8b     //second tap index (row 7)

    st1         {v24.8b},[x2], x3           //st (row 4)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 5)

    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 7)
    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 7)
    umull       v18.8h, v19.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v23.8b, v6.8b       //mul (row 7)

    st1         {v22.8b},[x2], x3           //st (row 5)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)

    st1         {v20.8b},[x2], x3           //st (row 6)

    subs        x10, x10, #4                //one tile done; finished if it was the only one

    st1         {v18.8b},[x2], x3           //st (row 7)

    beq         end_func

    // Choose the next tile. Every csel down to lbl284 (and the two after it)
    // consumes the flags of this subs; nothing in between writes flags.
    //   gt: more columns in this band -> next idx entry, dst up 8 rows and
    //       right 8 bytes
    //   le: band finished -> rewind idx/col tables and the column counter
    // NOTE(review): the le path here adds 8 to x2 while the matching step at
    // the end of kernel_8_16_32 subtracts 8. For nt >= 8 the le case cannot
    // occur at this point (x11 = 2*nt - 8 > 0), so this is left as found.
    subs        x11, x11, #8                //decrement the processed col
    add         x20, x8, #4
    csel        x8, x20, x8,gt
    add         x20, x2, x7
    csel        x2, x20, x2,gt
    csel        x8, x12, x8,le
    sub         x20, x2, x4
    csel        x2, x20, x2,le
    add         x20, x2, #8
    csel        x2, x20, x2,le
    lsl         x20, x4, #1
    csel        x11,x20,x11,le
    bgt lbl284
    adrp        x14,  :got:col_for_intra_chroma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
lbl284:
    add         x20, x0, #8
    csel        x0, x20, x0,le

    // Index/fract set-up for the second tile, consumed by the first kernel pass.
    ld1         {v31.8b},[x14],#8
    smull       v25.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col)
    xtn         v19.8b,  v25.8h             //low bytes; masked to fract inside the kernel
    sshr        v25.8h, v25.8h,#5
    sqxtn       v23.8b,  v25.8h
    shl         v23.8b, v23.8b,#1
    mov         x5, #0x302                  //idx value for v is +1 of u
    dup         v27.4h,w5                   //row value inc or reset accordingly
    ldrsw       x9,  [x8]                   //loads index value
    lsl         x9, x9, #1
    mov         x5, #22
    sub         x5, x5, x0, lsl #1
    dup         v16.8b,w5
    dup         v26.8b,w9

    mov         x5,x2                       //x5: store pointer for the rows carried into the kernel
    sub         v23.8b,  v23.8b ,  v27.8b   //ref_main_idx (sub row)

    // Software-pipelined loop: each pass finishes rows 4..7 of the previous
    // tile (stored through x5) while producing rows 0..3 of the current tile
    // (stored through x2) and preparing the indices of the next one.
    // On the first pass the previous tile is already complete, so x5 == x2 and
    // the early x5 stores are overwritten by this tile's rows 0..3.
kernel_8_16_32:
    movi        v29.8b, #2                  //byte distance between the two taps / two consecutive rows
    sub         v2.8b,  v26.8b ,  v23.8b    //ref_main_idx
    mov         v26.8b, v19.8b              //keep this tile's low product bytes for the fract mask

    subs        x11, x11, #8                //flags drive every csel until the subs on x10
    sub         x6, x1, x9
    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //(from previous loop)first tap (row 7)
    add         v2.8b,  v2.8b ,  v16.8b     //to compensate the pu1_src idx incremented by 8

    umull       v20.8h, v14.8b, v7.8b       //(from previous loop)mul (row 6)
    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //(from previous loop)second tap (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //(from previous loop)mul (row 6)

    add         x20, x0, #8
    csel        x0, x20, x0,le              //band finished: row offset += 8
    sub         v3.8b,  v2.8b ,  v29.8b     //second tap index (row 0)
    add         x20, x8, #4
    csel        x8, x20, x8,gt

    ld1         {v0.16b, v1.16b}, [x6]      //stores the 32 values reqd based on indices values (from most idx)
    rshrn       v22.8b, v22.8h,#5           //(from previous loop)round shft (row 5)

    bgt lbl326
    adrp        x14,  :got:col_for_intra_chroma
    ldr         x14,  [x14, #:got_lo12:col_for_intra_chroma]
lbl326:
    st1         {v24.8b},[x5], x3           //(from previous loop)st (row 4)
    csel        x8, x12, x8,le

    mov         x9,#0x302
    dup         v27.4h,w9                   //row value inc or reset accordingly
    sub         v4.8b,  v2.8b ,  v29.8b     //first tap index (row 1)

    sub         v5.8b,  v3.8b ,  v29.8b     //second tap index (row 1)
    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 0)
    movi        v29.8b, #31                 //mask that keeps the 5 fract bits

    umull       v18.8h, v19.8b, v7.8b       //(from previous loop)mul (row 7)
    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 0)
    umlal       v18.8h, v23.8b, v6.8b       //(from previous loop)mul (row 7)

    ld1         {v31.8b},[x14],#8
    and         v6.8b,  v29.8b ,  v26.8b    //fract values for this tile

    lsl         x20, x4, #1
    csel        x11,x20,x11,le              //band finished: col counter back to 2 * nt
    movi        v29.8b, #4                  //v2/v3 and v4/v5 each advance two rows at a time
    ldrsw       x9,  [x8]                   //idx table entry for the next tile

    st1         {v22.8b},[x5], x3           //(from previous loop)st (row 5)
    rshrn       v20.8b, v20.8h,#5           //(from previous loop)round shft (row 6)

    sub         v2.8b,  v2.8b ,  v29.8b     //first tap index (row 2)
    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 1)
    sub         v3.8b,  v3.8b ,  v29.8b     //second tap index (row 2)

    lsl         x9, x9, #1
    sub         v7.8b,  v28.8b ,  v6.8b     //32-fract

    umull       v24.8h, v25.8b, v7.8b       //mul (row 0)
    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 1)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 0)

    st1         {v20.8b},[x5], x3           //(from previous loop)st (row 6)
    rshrn       v18.8b, v18.8h,#5           //(from previous loop)round shft (row 7)

    sub         v4.8b,  v4.8b ,  v29.8b     //first tap index (row 3)
    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 2)
    sub         v5.8b,  v5.8b ,  v29.8b     //second tap index (row 3)

    umull       v22.8h, v19.8b, v7.8b       //mul (row 1)
    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 2)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 1)

    rshrn       v24.8b, v24.8h,#5           //round shft (row 0)
    st1         {v18.8b},[x5], x3           //(from previous loop)st (row 7)

    sub         v2.8b,  v2.8b ,  v29.8b     //first tap index (row 4)
    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 3)
    sub         v3.8b,  v3.8b ,  v29.8b     //second tap index (row 4)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 2)
    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 3)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 2)

    add         x5,x2,x3,lsl#2              //rows 4..7 of this tile are stored through x5 later
    smull       v14.8h, v30.8b, v31.8b      //(col+1)*intra_pred_angle [0:7](col) for the next tile
    add         x9, x9, x0, lsl #1

    st1         {v24.8b},[x2], x3           //st (row 0)
    rshrn       v22.8b, v22.8h,#5           //round shft (row 1)

    sub         v4.8b,  v4.8b ,  v29.8b     //first tap index (row 5)
    tbl         v25.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 4)
    sub         v5.8b,  v5.8b ,  v29.8b     //second tap index (row 5)

    umull       v18.8h, v19.8b, v7.8b       //mul (row 3)
    tbl         v13.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 4)
    umlal       v18.8h, v23.8b, v6.8b       //mul (row 3)

    st1         {v22.8b},[x2], x3           //st (row 1)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 2)

    xtn         v19.8b,  v14.8h             //next tile: low product bytes (fract source)
    sshr        v14.8h, v14.8h,#5           //next tile: idx

    sub         v2.8b,  v2.8b ,  v29.8b     //first tap index (row 6)
    tbl         v21.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 5); v21 because v19 is now taken
    sub         v3.8b,  v3.8b ,  v29.8b     //second tap index (row 6)

    umull       v24.8h, v25.8b, v7.8b       //mul (row 4)
    tbl         v17.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 5)
    sqxtn       v23.8b,  v14.8h

    st1         {v20.8b},[x2], x3           //st (row 2)
    umlal       v24.8h, v13.8b, v6.8b       //mul (row 4)

    rshrn       v18.8b, v18.8h,#5           //round shft (row 3)
    dup         v26.8b,w9

    sub         v4.8b,  v4.8b ,  v29.8b     //first tap index (row 7)
    tbl         v14.8b, {  v0.16b, v1.16b}, v2.8b //first tap (row 6)
    sub         v5.8b,  v5.8b ,  v29.8b     //second tap index (row 7)

    mov         x6, #22                     //to compensate the 2*row value
    shl         v23.8b, v23.8b,#1
    sub         x6, x6, x0, lsl #1

    umull       v22.8h, v21.8b, v7.8b       //mul (row 5)
    tbl         v15.8b, {  v0.16b, v1.16b}, v3.8b //second tap (row 6)
    umlal       v22.8h, v17.8b, v6.8b       //mul (row 5)

    st1         {v18.8b},[x2], x3           //st (row 3)
    rshrn       v24.8b, v24.8h,#5           //round shft (row 4)

    // Move x2 to the next tile (flags still from the subs on x11 above):
    //   gt: up 8 rows and right 8 bytes (x7 = 8 - 8 * dst_strd)
    //   le: x2 - nt - 8, i.e. back to byte column 0 of the next band when
    //       the band is 32 bytes wide (nt == 16)
    add         x2,x2,x3, lsl #2            //skip rows 4..7, which go out through x5
    dup         v16.8b,w6
    add         x20, x7, x2
    csel        x2, x20, x2,gt

    sub         x20, x2, x4
    csel        x2, x20, x2,le
    sub         v23.8b,  v23.8b ,  v27.8b   //ref_main_idx (sub row)
    sub         x20,x2,#8
    csel        x2, x20, x2,le

    subs        x10, x10, #4                //one more tile accounted for

    bne         kernel_8_16_32

    // Drain the pipeline: rows 4..7 of the last tile.
epil_8_16_32:
    tbl         v19.8b, {  v0.16b, v1.16b}, v4.8b //first tap (row 7)

    umull       v20.8h, v14.8b, v7.8b       //mul (row 6)
    tbl         v23.8b, {  v0.16b, v1.16b}, v5.8b //second tap (row 7)
    umlal       v20.8h, v15.8b, v6.8b       //mul (row 6)

    st1         {v24.8b},[x5], x3           //st (row 4)
    rshrn       v24.8b, v22.8h,#5           //round shft (row 5)

    umull       v18.8h, v19.8b, v7.8b       //mul (row 7)
    umlal       v18.8h, v23.8b, v6.8b       //mul (row 7)

    st1         {v24.8b},[x5], x3           //st (row 5)
    rshrn       v20.8b, v20.8h,#5           //round shft (row 6)

    st1         {v20.8b},[x5], x3           //st (row 6)
    rshrn       v18.8b, v18.8h,#5           //round shft (row 7)

    st1         {v18.8b},[x5], x3           //st (row 7)

end_func:
    // Restore in the reverse order of the saves; pairs keep sp 16-byte aligned.
    ldp         x19, x20,[sp],#16
    ldp         d8,d15,[sp],#16             //d8 is only the padding half of the d15 slot
    ldp         d13,d14,[sp],#16
    ret