@/*****************************************************************************
@*
@* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
@*
@* Licensed under the Apache License, Version 2.0 (the "License");
@* you may not use this file except in compliance with the License.
@* You may obtain a copy of the License at:
@*
@* http://www.apache.org/licenses/LICENSE-2.0
@*
@* Unless required by applicable law or agreed to in writing, software
@* distributed under the License is distributed on an "AS IS" BASIS,
@* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@* See the License for the specific language governing permissions and
@* limitations under the License.
@*
@*****************************************************************************/
@/**
@*******************************************************************************
@* @file
@*  ihevc_intra_pred_filters_dc.s
@*
@* @brief
@*  contains function definitions for intra prediction dc filtering.
@*  functions are coded using neon intrinsics and can be compiled using rvct
@*
@* @author
@*  akshaya mukund
@*
@* @par list of functions:
@*  - ihevc_intra_pred_luma_dc_a9q()
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/
@/**
@*******************************************************************************
@*
@* @brief
@*  luma intra prediction filter for dc input
@*
@* @par description:
@*  computes the dc value as the average of the top and left reference
@*  samples and fills the nt x nt block with it; for nt < 32 the first
@*  row and the first column are additionally smoothed towards the
@*  neighbouring reference samples
@*
@* @param[in] pu1_ref
@*  uword8 pointer to the source
@*
@* @param[out] pu1_dst
@*  uword8 pointer to the destination
@*
@* @param[in] src_strd
@*  integer source stride
@*
@* @param[in] dst_strd
@*  integer destination stride
@*
@* @param[in] nt
@*  size of transform block
@*
@* @param[in] mode
@*  type of filtering
@*
@* @returns
@*
@* @remarks
@*  none
@*
@*******************************************************************************
@*/

@void ihevc_intra_pred_luma_dc(uword8 *pu1_ref,
@                              word32 src_strd,
@                              uword8 *pu1_dst,
@                              word32 dst_strd,
@                              word32 nt,
@                              word32 mode)
@
@**************variables vs registers*****************************************
@r0 => *pu1_ref
@r1 => src_strd
@r2 => *pu1_dst
@r3 => dst_strd

@stack contents from #104
@   nt
@   mode

.equ    nt_offset,      104

.text
.align 4


.globl ihevc_intra_pred_luma_dc_a9q

.type ihevc_intra_pred_luma_dc_a9q, %function

ihevc_intra_pred_luma_dc_a9q:

    stmfd       sp!, {r4-r12, r14}          @stack stores the values of the arguments
    vpush       {d8 - d15}
    ldr         r4, [sp, #nt_offset]        @loads nt

@********** testing
    @mov        r6, #128
    @b          prologue_cpy_32
@********** testing

    mov         r11, #2                     @mov #2 to r11 (to be used to add to 2dc_val & 3dc_val)
    mov         r9, #0
    vmov        d17, r11, r9

    clz         r5, r4

    add         r6, r0, r4                  @&src[nt]
    rsb         r5, r5, #32                 @log2nt + 1
    add         r7, r0, r4, lsl #1          @&src[2nt]

    add         r8, r7, #1                  @&src[2nt+1]
    mvn         r5, r5
    add         r5, r5, #1
    vdup.32     d8, r5

    ldrb        r14, [r8]
    vshl.i64    d8, d8, #32

    sub         r9, r7, #1                  @&src[2nt-1]
    vshr.s64    d8, d8, #32

    mov         r7, r8                      @r7 also stores 2nt+1

    ldrb        r12, [r9]
    add         r14, r14, r12               @src[2nt+1] + src[2nt-1]
    add         r14, r14, r11               @src[2nt+1] + src[2nt-1] + 2

    cmp         r4, #4
    beq         dc_4

    mov         r10, r4                     @nt
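
@ the summation loop below computes the dc accumulation; roughly, in scalar
@ terms (illustrative sketch only, not part of the assembly):
@
@     acc = nt;                                 /* rounding offset            */
@     for(i = 0; i < nt; i++)
@         acc += pu1_ref[nt + i]                /* left reference column      */
@                + pu1_ref[2 * nt + 1 + i];     /* top reference row          */
@     dc_val = acc >> (log2nt + 1);             /* shift count negated in d8  */
@
@ the neon code consumes 8 left and 8 top samples per iteration through
@ pairwise widening adds (vpaddl/vpadal) into d6.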
add_loop:
    vld1.s8     d0, [r6]!                   @load from src[nt]
    mov         r5, #0                      @
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0

    vmov        d6, r4, r5                  @store nt to accumulate
    vpaddl.u8   d3, d1

    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 8)

    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 8)
    vadd.u16    d4, d2, d3

    vpaddl.u16  d5, d4

    vpadal.u32  d6, d5                      @accumulate all inp into d6 (end for nt==8)

    subs        r10, #8
    beq         epil_add_loop

core_loop_add:
    vpaddl.u8   d2, d0
    subs        r10, #8
    vpaddl.u8   d3, d1

    vadd.u16    d4, d2, d3
    vld1.s8     d0, [r6]!                   @load from src[nt] (extra load for 16)

    vpaddl.u16  d5, d4
    vld1.s8     d1, [r8]!                   @load from src[2nt+1] (extra load for 16)

    vpadal.u32  d6, d5                      @accumulate all inp into d6
    bne         core_loop_add

epil_add_loop:

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    cmp         r4, #32

    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
    moveq       r6, #128

    vdup.8      d16, d9[0]                  @dc_val
    vshl.s64    d13, d9, #1                 @2*dc

    beq         prologue_cpy_32

    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val
    movne       r6, #0                      @nt

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    movne       r10, r4

    vadd.i64    d11, d13, d9                @3*dc
    sub         r12, r3, r3, lsl #3         @-7*strd

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #8                @offset after one 8x8 block (-7*strd + 8)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt
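
@ for nt < 32 the dc mode also filters the first row and the first column;
@ roughly, in scalar terms (illustrative sketch only), the code below produces:
@
@     dst[0]            = (pu1_ref[2*nt - 1] + 2*dc_val + pu1_ref[2*nt + 1] + 2) >> 2;
@     dst[x]            = (pu1_ref[2*nt + 1 + x] + 3*dc_val + 2) >> 2;  /* row 0, x > 0 */
@     dst[y * dst_strd] = (pu1_ref[2*nt - 1 - y] + 3*dc_val + 2) >> 2;  /* col 0, y > 0 */
@
@ every remaining sample is set to dc_val (broadcast in d16); q12 holds the
@ 3*dc_val + 2 constant and d15 already holds the dst[0] value computed above.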
prologue_col:
    @0th column and 0-7 rows done here
    @r8 and r9 (2nt+1+col 2nt-1-row)

    mov         r8, r7                      @&src[2nt+1]

    add         r0, r0, #8                  @strd - nt + 8
    vld1.s8     d0, [r8]!                   @col 1::7 load (prol)
    sub         r9, r9, #7                  @&src[2nt-1-row]

    vld1.s8     d1, [r9]                    @row 7::1 (0 also) load (prol)
    sub         r9, r9, #8

    vmovl.u8    q10, d0

    vld1.s8     d6, [r8]                    @col 8::15 load (prol extra)
    vadd.i16    q10, q10, q12               @col 1::7 add 3dc+2 (prol)

    vmovl.u8    q11, d1
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmovl.u8    q13, d6
    vadd.i16    q11, q11, q12               @row 1::7 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vbsl        d19, d15, d2                @first row with dst[0]
    vadd.i16    q13, q13, q12               @col 8::15 add 3dc+2 (prol extra)

    vrev64.8    d3, d3

    vst1.8      d19, [r2], r3               @store row 0 (prol)
    vshr.s64    d3, d3, #8                  @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)

loop_again_col_row:

    vbsl        d20, d3, d16                @row 1 (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.8      d20, [r2], r3               @store row 1 (prol)
    vqshrun.s16 d4, q13, #2                 @columns shr2 movn (prol extra)

    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)
    vshr.s64    d3, d3, #8                  @row 2 shift (prol)

    vst1.8      d21, [r2], r3               @store row 2 (prol)

    vbsl        d20, d3, d16                @row 3 (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 4 (prol)
    vshr.s64    d3, d3, #8                  @row 3 shift (prol)

    vst1.8      d20, [r2], r3               @store row 3 (prol)

    vbsl        d21, d3, d16                @row 4 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 5 (prol)
    vshr.s64    d3, d3, #8                  @row 4 shift (prol)

    vst1.8      d21, [r2], r3               @store row 4 (prol)

    vbsl        d20, d3, d16                @row 5 (prol)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 6 (prol)
    vshr.s64    d3, d3, #8                  @row 5 shift (prol)

    vst1.8      d20, [r2], r3               @store row 5 (prol)

    vld1.s8     d1, [r9]                    @row 8::15 load (prol extra)

    vbsl        d21, d3, d16                @row 6 (prol)

    vmovl.u8    q11, d1

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 7 (prol)
    vshr.s64    d3, d3, #8                  @row 6 shift (prol)

    vst1.8      d21, [r2], r3               @store row 6 (prol)

    vbsl        d20, d3, d16                @row 7 (prol)
    vadd.i16    q11, q11, q12               @row 8::15 add 3dc+2 (prol extra)

    vshr.s64    d3, d3, #8                  @row 7 shift (prol)
    vst1.8      d20, [r2], r12              @store row 7 (prol)

    subs        r10, r10, #8                @counter for cols

    beq         end_func
    blt         copy_16

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vrev64.8    d3, d3

    vst1.8      d4, [r2], r3                @store 2nd col (for 16x16)

    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r0               @go to next row for 16

    vbsl        d20, d3, d16                @row 9 (prol)
    subs        r10, r10, #8

    vst1.8      d20, [r2], r3               @store row 9 (prol)
    vshr.s64    d3, d3, #8                  @row 9 shift (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 9 (prol)

    b           loop_again_col_row


copy_16:
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2], r3
    vst1.8      d16, [r2]

    b           end_func

prologue_cpy_32:
    mov         r9, #128
    @sub        r7, r3, #-24
    add         r5, r2, r3
    add         r8, r5, r3
    add         r10, r8, r3
    vdup.8      q10, d16[0]
    lsl         r6, r3, #2
    add         r6, r6, #0xfffffff0

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    sub         r9, r9, #32                 @32x32 prol/epil counter dec

kernel_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    subs        r9, r9, #32

    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2], r6
    vst1.8      {d20,d21}, [r5], r6
    vst1.8      {d20,d21}, [r8], r6
    vst1.8      {d20,d21}, [r10], r6

    bne         kernel_copy

epilogue_copy:
    vst1.8      {d20,d21}, [r2]!
    vst1.8      {d20,d21}, [r5]!
    vst1.8      {d20,d21}, [r8]!
    vst1.8      {d20,d21}, [r10]!

    vst1.8      {d20,d21}, [r2]
    vst1.8      {d20,d21}, [r5]
    vst1.8      {d20,d21}, [r8]
    vst1.8      {d20,d21}, [r10]

    b           end_func
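
@ dc_4 handles the nt == 4 case: the same accumulate-and-shift and the same
@ first row / first column filtering as above, but with a single 4+4 sample
@ sum and 32-bit stores per output row.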
dc_4:
    vld1.s8     d0, [r6]!                   @load from src[nt]
    vld1.s8     d1, [r8]!                   @load from src[2nt+1]

    vpaddl.u8   d2, d0
    mov         r5, #0                      @
    vmov        d6, r4, r5                  @store nt to accumulate
    vpaddl.u8   d3, d1

    vadd.u16    d4, d2, d3

    vpaddl.u16  d5, d4
    vmov.i64    d30, #0x00000000ffffffff

    vand        d5, d5, d30

    vmov        d28, r14, r5                @src[2nt+1]+2+src[2nt-1] moved to d28
    vadd.i64    d6, d6, d5                  @accumulate all inp into d6 (end for nt==8)

    vshl.s64    d9, d6, d8                  @(dc_val) shr by log2nt+1
    mov         r8, r7                      @&src[2nt+1]

    vshl.s64    d13, d9, #1                 @2*dc
    sub         r9, r9, #3                  @&src[2nt-1-row]

    vdup.8      d16, d9[0]                  @dc_val
    vadd.i64    d14, d13, d28               @src[2nt+1]+2+src[2nt-1]+2dc_val

    vshr.u16    d15, d14, #2                @final dst[0]'s value in d15[0]
    sub         r12, r3, r3, lsl #2         @-3*strd
    vadd.i64    d11, d13, d9                @3*dc

    vadd.i64    d11, d11, d17               @3*dc + 2
    add         r12, r12, #4                @offset after one 4x4 block (-3*strd + 4)

    vdup.16     q12, d11[0]                 @3*dc + 2 (moved to all lanes)
    sub         r0, r3, r4                  @strd - nt

    vld1.s8     d0, [r8]                    @col 1::3 load (prol)
    vld1.s8     d1, [r9]                    @row 3::1 (0 also) load (prol)

    vmovl.u8    q10, d0

    vmovl.u8    q11, d1
    vadd.i16    q10, q10, q12               @col 1::7 add 3dc+2 (prol)

    vadd.i16    q11, q11, q12               @row 1::7 add 3dc+2 (prol)

    vmov.i64    d19, #0x00000000000000ff    @
    vqshrun.s16 d2, q10, #2                 @columns shr2 movn (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 1 (prol)
    vqshrun.s16 d3, q11, #2                 @rows shr2 movn (prol)

    vbsl        d19, d15, d2                @first row with dst[0]

    vrev64.8    d3, d3

    vst1.32     d19[0], [r2], r3            @store row 0 (prol)
    vshr.s64    d3, d3, #40                 @row 0 shift (prol) (first value to be ignored)

    vmov.i64    d21, #0x00000000000000ff    @byte mask row 2 (prol)

    vbsl        d20, d3, d16                @row 1 (prol)
    vshr.s64    d3, d3, #8                  @row 1 shift (prol)

    vst1.32     d20[0], [r2], r3            @store row 1 (prol)

    vbsl        d21, d3, d16                @row 2 (prol)

    vmov.i64    d20, #0x00000000000000ff    @byte mask row 3 (prol)

    vshr.s64    d3, d3, #8                  @row 2 shift (prol)
    vst1.32     d21[0], [r2], r3            @store row 2 (prol)

    vbsl        d20, d3, d16                @row 3 (prol)
    vst1.32     d20[0], [r2]                @store row 3 (prol)

epilogue_end:
end_func:
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r12, r15}          @reload the registers from sp