/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"

/*
 * Per-16x16-block picture statistics, ARM 32-bit NEON, GNU as syntax.
 *
 * Conventions shared by every entry point in this file:
 *   - r0 and r1 point at the two pictures being compared (the SSD/BGD/VAR
 *     macros call them cur and ref), r2 is pic_width, r3 is counted down by
 *     16 per row of blocks (presumably pic_height), and the remaining
 *     arguments are read from the stack.
 *   - Both loop counters are decremented by 16 and the loops exit only on
 *     exactly zero, so width and height must be non-zero multiples of 16.
 *   - Every function is a leaf. Only registers that the procedure call
 *     standard requires a callee to preserve (r4-r11, lr, q4-q7) are saved,
 *     and only when the function actually writes them.
 *   - Within a 16x16 block the four 8x8 results are always produced in the
 *     order top-left, top-right, bottom-left, bottom-right.
 */

/* ------------------------------------------------------------------------
 * Helpers for VAACalcSad_neon
 * ---------------------------------------------------------------------- */

// One 16-byte row: arg3 += |row0[0..7] - row1[0..7]|, arg4 += |row0[8..15] - row1[8..15]|
// arg0/arg1 = row pointers (post-incremented by arg2), arg3/arg4 = u16x8 accumulators.
// Clobbers q14, q15.
.macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
    vld1.32     {q15}, [\arg0], \arg2
    vld1.32     {q14}, [\arg1], \arg2
    vabal.u8    \arg3, d30, d28
    vabal.u8    \arg4, d31, d29
.endm

// Eight 16-byte rows. The first row initialises arg3/arg4 (vabdl), the
// remaining seven accumulate into them. Clobbers q14, q15.
.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
    vld1.32     {q15}, [\arg0], \arg2
    vld1.32     {q14}, [\arg1], \arg2
    vabdl.u8    \arg3, d30, d28
    vabdl.u8    \arg4, d31, d29

    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
.endm

// Horizontal sum of the eight u16 lanes held in the d-register pair
// arg0/arg1. The total is left in arg2 (a d register). Clobbers d31.
.macro SAD_8X16BITS arg0, arg1, arg2
    vadd.u16    d31, \arg0, \arg1
    vpaddl.u16  d31, d31
    vpaddl.u32  \arg2, d31
.endm


/*
 * VAACalcSad_neon
 *
 * For every 16x16 block: store four 32-bit 8x8 SAD values to psad8x8.
 * After the last block: store the sum of all of them to psadframe.
 *
 * In:    r0, r1      picture pointers (their roles are symmetric here)
 *        r2          pic_width
 *        r3          block-row counter, minus 16 per row (presumably pic_height)
 *        [sp, #0]    pic_stride
 *        [sp, #4]    psadframe
 *        [sp, #8]    psad8x8
 * Regs:  r4 = pic_stride, r5 = psad8x8 write pointer, r6 = width counter
 *        (psadframe pointer at the end), r7 = 16*pic_stride - 16,
 *        r8 = 16*pic_stride - pic_width, q8 = running frame SAD.
 * Clobb: r0-r3, q0-q3, q8, q14, q15, flags.
 */
WELS_ASM_FUNC_BEGIN VAACalcSad_neon

    stmdb       sp!, {r4-r8}            // 5 registers = 20 bytes

    ldr         r4, [sp, #20]           // pic_stride
    ldr         r5, [sp, #28]           // psad8x8

    // q8 accumulates the frame SAD ("psadframe")
    vmov.s64    q8, #0

    // Pointer steps used by the two loops
    lsl         r8, r4, #4
    sub         r7, r8, #16             // r7 = 16*pic_stride - 16
    sub         r8, r2                  // r8 = 16*pic_stride - pic_width

vaa_calc_sad_loop0:

    // r6 = bytes left in this row of blocks
    mov         r6, r2

vaa_calc_sad_loop1:

    // One 16x16 block: q0/q1 = upper eight rows (left/right eight columns),
    // q2/q3 = lower eight rows
    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3

    // Reduce each accumulator to one SAD value in d0..d3
    SAD_8X16BITS d0, d1, d0
    SAD_8X16BITS d2, d3, d1
    SAD_8X16BITS d4, d5, d2
    SAD_8X16BITS d6, d7, d3

    // Write the four 8x8 SADs to the "psad8x8" buffer
    vst4.32     {d0[0],d1[0],d2[0],d3[0]}, [r5]!

    // Both pointers advanced 16 rows, step back to the top of the next block
    sub         r0, r7
    sub         r1, r7

    subs        r6, #16                 // flags are consumed by the bne below

    // Fold this block into the frame SAD (NEON ops leave the flags alone)
    vadd.u32    q0, q1
    vadd.u32    q8, q0

    bne         vaa_calc_sad_loop1

    // Move from the end of this block row to the start of the next one
    add         r0, r8
    add         r1, r8

    subs        r3, #16
    bne         vaa_calc_sad_loop0

    ldr         r6, [sp, #24]           // psadframe
    vadd.u32    d16, d17
    vst1.32     {d16[0]}, [r6]

    ldmia       sp!, {r4-r8}

WELS_ASM_FUNC_END


/* ------------------------------------------------------------------------
 * Helpers for VAACalcSadBgd_neon
 * ---------------------------------------------------------------------- */

// One 16-byte row.
//   arg3 += pairwise byte sums of the arg0 row    (u16x8)
//   arg4 += pairwise byte sums of the arg1 row    (u16x8)
//   arg5  = max(arg5, |row0 - row1|)              (u8x16)
//   arg6 += pairwise sums of |row0 - row1|        (u16x8)
// Clobbers q0, q1.
.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
    vld1.32     {q0}, [\arg0], \arg2
    vld1.32     {q1}, [\arg1], \arg2

    vpadal.u8   \arg3, q0
    vpadal.u8   \arg4, q1

    vabd.u8     q0, q0, q1
    vmax.u8     \arg5, q0
    vpadal.u8   \arg6, q0
.endm

// Eight 16-byte rows.
//   arg3 = per-column maximum absolute difference (u8x16)
//   arg4 = pairwise sums of the absolute differences (u16x8)
//   arg5 = (row sums of arg0 data) - (row sums of arg1 data) (16-bit lanes)
// Clobbers q0-q3.
.macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
    vld1.32     {q0}, [\arg0], \arg2
    vld1.32     {q1}, [\arg1], \arg2

    vpaddl.u8   q2, q0
    vpaddl.u8   q3, q1

    vabd.u8     \arg3, q0, q1
    vpaddl.u8   \arg4, \arg3            //abs_diff

    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4

    vsub.u16    \arg5, q2, q3
.endm

// Final reductions for one 8-row half of a 16x16 block.
//   arg0/arg1 = d halves of the max register, arg2 = d register receiving the
//               maxima: byte 0 = left eight columns, byte 1 = right eight
//   arg3      = SAD accumulator, reduced in place to one value per d half
//   arg4      = signed sum-difference accumulator, reduced the same way
// Clobbers d0.
.macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
    vpmax.u8    d0, \arg0, \arg1        //8bytes
    vpmax.u8    d0, d0, d0              //4bytes
    vpmax.u8    \arg2, d0, d0           //2bytes

    vpaddl.u16  \arg3, \arg3
    vpaddl.u32  \arg3, \arg3
    vpaddl.s16  \arg4, \arg4
    vpaddl.s32  \arg4, \arg4
.endm


/*
 * VAACalcSadBgd_neon
 *
 * Per 16x16 block: four 32-bit SADs to psad8x8, four 32-bit sum differences
 * (r0 data minus r1 data) to psd8x8 and four 8-bit maximum absolute
 * differences to pmad8x8. After the last block: total SAD to psadframe.
 *
 * In:    r0, r1      picture pointers
 *        r2          pic_width
 *        r3          block-row counter, minus 16 per row (presumably pic_height)
 *        [sp, #0]    pic_stride
 *        [sp, #4]    psadframe
 *        [sp, #8]    psad8x8
 *        [sp, #12]   psd8x8
 *        [sp, #16]   pmad8x8
 * Regs:  r4 = pic_stride, r5/r6/r7 = psad8x8/psd8x8/pmad8x8 write pointers,
 *        r8 = width counter (psadframe pointer at the end),
 *        r9 = 16*pic_stride - 16, r10 = 16*pic_stride - pic_width,
 *        q15 = running frame SAD.
 * Clobb: r0-r3, q0-q3, q8-q15, flags.
 */
WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon

    stmdb       sp!, {r4-r10}           // 7 registers = 28 bytes

    ldr         r4, [sp, #28]           // pic_stride
    ldr         r5, [sp, #36]           // psad8x8
    ldr         r6, [sp, #40]           // psd8x8
    ldr         r7, [sp, #44]           // pmad8x8

    // q15 accumulates the frame SAD ("psadframe")
    vmov.s64    q15, #0

    // Pointer steps used by the two loops
    lsl         r10, r4, #4
    sub         r9, r10, #16            // r9  = 16*pic_stride - 16
    sub         r10, r2                 // r10 = 16*pic_stride - pic_width

vaa_calc_sad_bgd_loop0:

    // r8 = bytes left in this row of blocks
    mov         r8, r2

vaa_calc_sad_bgd_loop1:

    // One 16x16 block, upper then lower half: mad in q13/q14, sad in q11/q12,
    // sd in q9/q10
    SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
    SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10

    SAD_SD_MAD_CALC d26, d27, d16, q11, q9
    SAD_SD_MAD_CALC d28, d29, d17, q12, q10

    //Write to "psad8x8" buffer
    vst4.32     {d22[0],d23[0],d24[0],d25[0]}, [r5]!
    // Both pointers advanced 16 rows, step back to the top of the next block
    sub         r0, r9
    sub         r1, r9
    //Write to "psd8x8" buffer
    vst4.32     {d18[0],d19[0],d20[0],d21[0]}, [r6]!
    subs        r8, #16                 // flags are consumed by the bne below
    //Write to "pmad8x8" buffer (two bytes from d16, two from d17)
    vst2.16     {d16[0],d17[0]}, [r7]!
    // Fold this block into the frame SAD
    vadd.u32    q11, q12
    vadd.u32    q15, q11

    bne         vaa_calc_sad_bgd_loop1

    // Move from the end of this block row to the start of the next one
    add         r0, r10
    add         r1, r10

    subs        r3, #16
    bne         vaa_calc_sad_bgd_loop0

    ldr         r8, [sp, #32]           // psadframe
    vadd.u32    d30, d31
    vst1.32     {d30[0]}, [r8]
    ldmia       sp!, {r4-r10}

WELS_ASM_FUNC_END


/* ------------------------------------------------------------------------
 * Sum-of-squares helpers shared by the SSD / VAR kernels
 * ---------------------------------------------------------------------- */

// arg2 = pairwise-widened sum of the squares of the 16 bytes in arg0/arg1
// (overwrites arg2). arg3 is a q scratch register.
.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
    vmull.u8    \arg3, \arg0, \arg0
    vpaddl.u16  \arg2, \arg3

    vmull.u8    \arg3, \arg1, \arg1
    vpadal.u16  \arg2, \arg3
.endm

// arg2 += pairwise-widened sum of the squares of the 16 bytes in arg0/arg1.
// arg3 is a q scratch register.
.macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
    vmull.u8    \arg3, \arg0, \arg0
    vpadal.u16  \arg2, \arg3

    vmull.u8    \arg3, \arg1, \arg1
    vpadal.u16  \arg2, \arg3
.endm


/* ------------------------------------------------------------------------
 * Helpers for VAACalcSadSsdBgd_neon
 *
 * Fixed register roles inside these macros:
 *   q0 = cur row, q1 = ref row (always loaded one row ahead), q2 = abs diff,
 *   q3/q4 = cur/ref row sums (reset every 8 rows), q5 = max abs diff (reset
 *   every 8 rows), q8 = l_sqdiff, q9 = l_sum, q10 = l_sqsum (reset every
 *   16x16), q11 = scratch. The l_sad accumulator is passed as an argument.
 * ---------------------------------------------------------------------- */

// Rows 1..6 of each 8-row half, and row 7 of the upper half.
.macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
    vld1.8      {q0}, [\arg0], \arg2    //load cur_row

    vpadal.u8   q3, q0                  //add cur_row together
    vpadal.u8   q4, q1                  //add ref_row together

    vabd.u8     q2, q0, q1              //abs_diff

    vmax.u8     q5, q2                  //l_mad for 16 bytes reset for every 8x16

    vpadal.u8   \arg3, q2               //l_sad for 16 bytes reset for every 8x16

    SSD_MUL_SUM_16BYTES d4,d5, q8, q11  //q8 for l_sqdiff reset for every 16x16

    vld1.8      {q1}, [\arg1], \arg2    //load ref_row for the next row
    vpadal.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm

//the last row of a 16x16 block: no further ref row is loaded
.macro SAD_SSD_BGD_16_end arg0, arg1, arg2
    vld1.8      {q0}, [\arg0], \arg1    //load cur_row

    vpadal.u8   q3, q0                  //add cur_row together
    vpadal.u8   q4, q1                  //add ref_row together

    vabd.u8     q2, q0, q1              //abs_diff

    vmax.u8     q5, q2                  //l_mad for 16 bytes reset for every 8x16

    vpadal.u8   \arg2, q2               //l_sad for 16 bytes reset for every 8x16

    SSD_MUL_SUM_16BYTES d4,d5, q8, q11  //q8 for l_sqdiff reset for every 16x16

    vpadal.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm

//first row of the lower 8x16 half: the per-half registers are overwritten
//instead of accumulated, the per-16x16 registers keep accumulating
.macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
    vld1.8      {q0}, [\arg0], \arg2    //load cur_row

    vpaddl.u8   q3, q0                  //add cur_row together
    vpaddl.u8   q4, q1                  //add ref_row together

    vabd.u8     q2, q0, q1              //abs_diff

    vmov        q5,q2                   //start the max from this row, no zeroing needed

    vpaddl.u8   \arg3, q2               //l_sad for 16 bytes reset for every 8x16

    SSD_MUL_SUM_16BYTES d4,d5, q8, q11  //q8 for l_sqdiff reset for every 16x16

    vld1.8      {q1}, [\arg1], \arg2    //load ref_row for the next row

    vpadal.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm

//first row of a 16x16 block: every accumulator is overwritten
.macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
    vld1.8      {q0}, [\arg0], \arg2    //load cur_row
    vld1.8      {q1}, [\arg1], \arg2    //load ref_row

    vpaddl.u8   q3, q0                  //add cur_row together
    vpaddl.u8   q4, q1                  //add ref_row together

    vabd.u8     q2, q0, q1              //abs_diff

    vmov        q5,q2                   //start the max from this row, no zeroing needed

    vpaddl.u8   \arg3, q2               //l_sad for 16 bytes reset for every 8x16

    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11     //q8 for l_sqdiff reset for every 16x16

    vld1.8      {q1}, [\arg1], \arg2    //load ref_row for the next row

    vpaddl.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11     //q10 for lsqsum reset for every 16x16
.endm

//reductions for each 8x16 half
//  arg0 = d register receiving l_mad (byte 0 = left 8 columns, byte 1 = right)
//  arg1 = q register receiving l_sd  (one value per d half)
//  arg2 = l_sad accumulator, reduced in place and added to q12 (frame SAD)
.macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2

    vpmax.u8    d10, d10, d11           //4 maxima for the left half, 4 for the right
    vpmax.u8    d10, d10, d10           //2 and 2
    vpmax.u8    d10, d10, d10           //1 and 1

    vmov        \arg0, d10              //keep the l_mad pair

    //p_sd8x8
    vpaddl.u16  q3, q3
    vpaddl.u16  q4, q4

    vsub.i32    \arg1, q3, q4
    vpaddl.u32  \arg1, \arg1            //only the low 32 bits of each half are stored

    //psad8x8
    vpaddl.u16  \arg2, \arg2
    vpaddl.u32  \arg2, \arg2

    //psadframe
    vadd.i32    q12, \arg2
.endm

// One complete 16x16 block. arg0 = cur pointer, arg1 = ref pointer,
// arg2 = stride. Both pointers end up 16 rows further on.
// Results: q6/q7 = l_sad, d26/d27 = l_mad, q14/q15 = l_sd (upper/lower half),
// q8/q9/q10 = unreduced l_sqdiff/l_sum/l_sqsum.
.macro SAD_SSD_BGD_16x16 arg0, arg1, arg2
    //for one 8x16
    SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6

    SAD_SSD_BGD_CALC_8x16 d26, q14, q6

    //for another 8x16
    SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16_end \arg0, \arg2, q7

    SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm

// Reduce the eight 16-bit lanes of q register arg0 to one total, left in the
// low 32 bits of arg1. arg1/arg2 must be the two d halves of arg0.
.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
    vpaddl.s16  \arg0, \arg0
    vpaddl.s32  \arg0, \arg0
    vadd.i32    \arg1, \arg1, \arg2
.endm


/*
 * VAACalcSadSsdBgd_neon
 *
 * Per 16x16 block: four SADs to psad8x8, four cur-minus-ref sum differences
 * to p_sd8x8, four 8-bit maximum absolute differences to p_mad8x8, and one
 * 32-bit value each to psum16x16 (sum of cur), psqsum16x16 (sum of cur
 * squared) and psqdiff16x16 (sum of squared differences).
 * After the last block: total SAD to psadframe.
 *
 * In:    r0 = cur, r1 = ref, r2 = pic_width,
 *        r3 = block-row counter, minus 16 per row (presumably pic_height)
 *        [sp, #0]  pic_stride     [sp, #4]  psadframe
 *        [sp, #8]  psad8x8        [sp, #12] psum16x16
 *        [sp, #16] psqsum16x16    [sp, #20] psqdiff16x16
 *        [sp, #24] p_sd8x8        [sp, #28] p_mad8x8
 * Regs:  r4 = pic_stride, r5 = 16*pic_stride - 16, r6 = pic_width -
 *        16*pic_stride, r7 = width counter (psadframe pointer at the end),
 *        r8-r12 and lr = output write pointers, q12 = running frame SAD.
 * Saved: r4-r11 and lr (36 bytes) plus q4-q7 (64 bytes). r0-r3 and r12 are
 *        scratch registers under the procedure call standard and are not
 *        saved, so the first stack argument sits at sp + 100.
 * Clobb: r0-r3, r12, q0-q3, q8-q15, flags.
 */
WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
    stmdb       sp!, {r4-r11, r14}      // 9 registers = 36 bytes
    vpush       {q4-q7}                 // 64 bytes, 100 in total

    ldr         r4, [sp, #100]          //r4 keeps the pic_stride

    sub         r5, r4, #1
    lsl         r5, r5, #4              //r5 keeps the little step

    lsl         r6, r4, #4
    sub         r6, r2, r6              //r6 keeps the big step (negative of the advance)

    ldr         r8, [sp, #108]          //psad8x8
    ldr         r9, [sp, #112]          //psum16x16
    ldr         r10, [sp, #116]         //psqsum16x16
    ldr         r11, [sp, #120]         //psqdiff16x16
    ldr         r12, [sp, #124]         //p_sd8x8
    ldr         r14, [sp, #128]         //p_mad8x8

    vmov.i8     q12, #0                 //frame SAD accumulator

vaa_calc_sad_ssd_bgd_height_loop:

    mov         r7, r2
vaa_calc_sad_ssd_bgd_width_loop:

    //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff q8, l_sum q9, l_sqsum q10
    SAD_SSD_BGD_16x16 r0,r1,r4

    //psad8x8
    vst4.32     {d12[0], d13[0], d14[0], d15[0]}, [r8]!

    sub         r0, r0, r5              //jump to next 16x16
    sub         r1, r1, r5              //jump to next 16x16

    //p_sd8x8
    vst4.32     {d28[0], d29[0],d30[0], d31[0]}, [r12]!

    //p_mad8x8
    vst2.16     {d26[0], d27[0]}, [r14]!

    //psqdiff16x16
    vpaddl.s32  q8, q8
    vadd.i32    d16, d16, d17

    vst1.32     {d16[0]}, [r11]!        //psqdiff16x16

    //psum16x16
    SSD_SAD_SD_MAD_PADDL q9, d18, d19
    vst1.32     {d18[0]}, [r9]!         //psum16x16

    //psqsum16x16
    vpaddl.s32  q10, q10
    vadd.i32    d20, d20, d21
    vst1.32     {d20[0]}, [r10]!        //psqsum16x16

    subs        r7, #16

    bne         vaa_calc_sad_ssd_bgd_width_loop

    sub         r0, r0, r6              //jump to next 16 x width
    sub         r1, r1, r6              //jump to next 16 x width

    subs        r3, #16
    bne         vaa_calc_sad_ssd_bgd_height_loop

    //psadframe
    ldr         r7, [sp, #104]          //psadframe

    vadd.i32    d24, d24, d25
    vst1.32     {d24[0]}, [r7]

    vpop        {q4-q7}
    ldmia       sp!, {r4-r11, r14}

WELS_ASM_FUNC_END


/* ------------------------------------------------------------------------
 * Helpers for VAACalcSadVar_neon (also reused by the SAD_SSD_* wrappers)
 *
 * Fixed register roles: q0 = cur row, q1 = ref row (loaded one row ahead),
 * q2 = abs diff (left valid for the caller), q9 = l_sum, q10 = l_sqsum,
 * q11 = scratch. The l_sad accumulator is passed as an argument.
 * No row-sum registers are kept here: neither user of these macros emits a
 * sum difference, so q3/q4 are left untouched.
 * ---------------------------------------------------------------------- */

// Rows 1..6 of each 8-row half, and row 7 of the upper half.
.macro SAD_VAR_16 arg0, arg1, arg2, arg3
    vld1.8      {q0}, [\arg0], \arg2    //load cur_row

    vabd.u8     q2, q0, q1              //abs_diff

    vpadal.u8   \arg3, q2               //l_sad for 16 bytes reset for every 8x16

    vld1.8      {q1}, [\arg1], \arg2    //load ref_row for the next row

    vpadal.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm

// Last row of a 16x16 block: no further ref row is loaded.
.macro SAD_VAR_16_END arg0, arg1, arg2
    vld1.8      {q0}, [\arg0], \arg1    //load cur_row

    vabd.u8     q2, q0, q1              //abs_diff

    vpadal.u8   \arg2, q2               //l_sad for 16 bytes reset for every 8x16

    vpadal.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm

// First row of a 16x16 block: every accumulator is overwritten.
.macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
    vld1.8      {q0}, [\arg0], \arg2    //load cur_row
    vld1.8      {q1}, [\arg1], \arg2    //load ref_row

    vabd.u8     q2, q0, q1              //abs_diff

    vpaddl.u8   \arg3, q2               //l_sad for 16 bytes reset for every 8x16

    vld1.8      {q1}, [\arg1], \arg2    //load ref_row for the next row

    vpaddl.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11   //q10 for lsqsum reset for every 16x16
.endm

// First row of the lower 8x16 half: only l_sad is overwritten.
.macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
    vld1.8      {q0}, [\arg0], \arg2    //load cur_row

    vabd.u8     q2, q0, q1              //abs_diff

    vpaddl.u8   \arg3, q2               //l_sad for 16 bytes reset for every 8x16

    vld1.8      {q1}, [\arg1], \arg2    //load ref_row for the next row

    vpadal.u8   q9, q0                  //q9 for l_sum reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11 //q10 for lsqsum reset for every 16x16
.endm

// One complete 16x16 block. Results: q6/q7 = l_sad of the upper/lower half
// (one value per d half), q9/q10 = unreduced l_sum/l_sqsum, q12 += SADs.
.macro SAD_VAR_16x16 arg0, arg1, arg2
    //for one 8x16
    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6

    vpaddl.u16  q6, q6
    vpaddl.u32  q6, q6
    vadd.i32    q12, q6

    //for another 8x16
    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16_END \arg0, \arg2, q7

    vpaddl.u16  q7, q7
    vpaddl.u32  q7, q7

    vadd.i32    q12, q7
.endm


/*
 * VAACalcSadVar_neon
 *
 * Per 16x16 block: four SADs to psad8x8 and one 32-bit value each to
 * psum16x16 (sum of cur) and psqsum16x16 (sum of cur squared).
 * After the last block: total SAD to psadframe.
 *
 * In:    r0 = cur, r1 = ref, r2 = pic_width,
 *        r3 = block-row counter, minus 16 per row (presumably pic_height)
 *        [sp, #0]  pic_stride     [sp, #4]  psadframe
 *        [sp, #8]  psad8x8        [sp, #12] psum16x16
 *        [sp, #16] psqsum16x16
 * Regs:  r4 = pic_stride, r5 = 16*pic_stride - 16, r6 = pic_width -
 *        16*pic_stride, r7 = psadframe, r8-r10 = output write pointers,
 *        r11 = width counter, q12 = running frame SAD.
 * Saved: r4-r11 (32 bytes) plus q6-q7 (32 bytes), so the first stack
 *        argument sits at sp + 64.
 * Clobb: r0-r3, q0-q2, q9-q12, flags.
 */
WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
    stmdb       sp!, {r4-r11}           // 8 registers = 32 bytes
    vpush       {q6-q7}                 // 32 bytes, 64 in total

    ldr         r4, [sp, #64]           //r4 keeps the pic_stride

    sub         r5, r4, #1
    lsl         r5, r5, #4              //r5 keeps the little step

    lsl         r6, r4, #4
    sub         r6, r2, r6              //r6 keeps the big step (negative of the advance)

    ldr         r7, [sp, #68]           //psadframe
    ldr         r8, [sp, #72]           //psad8x8
    ldr         r9, [sp, #76]           //psum16x16
    ldr         r10, [sp, #80]          //psqsum16x16

    vmov.i8     q12, #0                 //frame SAD accumulator
vaa_calc_sad_var_height_loop:

    mov         r11, r2
vaa_calc_sad_var_width_loop:

    SAD_VAR_16x16 r0,r1,r4
    //psad8x8
    vst4.32     {d12[0], d13[0], d14[0], d15[0]}, [r8]!

    sub         r0, r0, r5              //jump to next 16x16
    sub         r1, r1, r5              //jump to next 16x16

    //psum16x16
    SSD_SAD_SD_MAD_PADDL q9, d18, d19
    vst1.32     {d18[0]}, [r9]!         //psum16x16

    //psqsum16x16
    vpaddl.s32  q10, q10
    subs        r11, #16                // flags are consumed by the bne below
    vadd.i32    d20, d20, d21
    vst1.32     {d20[0]}, [r10]!        //psqsum16x16

    bne         vaa_calc_sad_var_width_loop

    sub         r0, r0, r6              //jump to next 16 x width
    sub         r1, r1, r6              //jump to next 16 x width

    subs        r3, #16
    bne         vaa_calc_sad_var_height_loop

    vadd.i32    d24, d24, d25
    vst1.32     {d24[0]}, [r7]

    vpop        {q6-q7}
    ldmia       sp!, {r4-r11}
WELS_ASM_FUNC_END


/* ------------------------------------------------------------------------
 * Helpers for VAACalcSadSsd_neon: the SAD_VAR_* row macros plus the sum of
 * squared differences, accumulated in q8 from the abs diff left in q2.
 * ---------------------------------------------------------------------- */

.macro SAD_SSD_16 arg0, arg1, arg2, arg3
    SAD_VAR_16 \arg0, \arg1, \arg2, \arg3

    SSD_MUL_SUM_16BYTES d4,d5,q8, q11   //q8 for l_sqdiff reset for every 16x16
.endm

.macro SAD_SSD_16_END arg0, arg1, arg2
    SAD_VAR_16_END \arg0, \arg1, \arg2

    SSD_MUL_SUM_16BYTES d4,d5,q8, q11   //q8 for l_sqdiff reset for every 16x16
.endm

.macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3

    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11     //q8 for l_sqdiff reset for every 16x16
.endm

.macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3

    SSD_MUL_SUM_16BYTES d4,d5,q8, q11   //q8 for l_sqdiff reset for every 16x16
.endm

// One complete 16x16 block. Results: q6/q7 = l_sad of the upper/lower half,
// q8/q9/q10 = unreduced l_sqdiff/l_sum/l_sqsum, q12 += SADs.
.macro SAD_SSD_16x16 arg0, arg1, arg2
    //for one 8x16
    SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6

    vpaddl.u16  q6, q6
    vpaddl.u32  q6, q6
    vadd.i32    q12, q6

    //for another 8x16
    SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16_END \arg0, \arg2, q7

    vpaddl.u16  q7, q7
    vpaddl.u32  q7, q7

    vadd.i32    q12, q7
.endm


/*
 * VAACalcSadSsd_neon
 *
 * Per 16x16 block: four SADs to psad8x8 and one 32-bit value each to
 * psum16x16 (sum of cur), psqsum16x16 (sum of cur squared) and psqdiff16x16
 * (sum of squared differences). After the last block: total SAD to psadframe.
 *
 * In:    r0 = cur, r1 = ref, r2 = pic_width,
 *        r3 = block-row counter, minus 16 per row (presumably pic_height)
 *        [sp, #0]  pic_stride     [sp, #4]  psadframe
 *        [sp, #8]  psad8x8        [sp, #12] psum16x16
 *        [sp, #16] psqsum16x16    [sp, #20] psqdiff16x16
 * Regs:  r4 = pic_stride, r5 = 16*pic_stride - 16, r6 = pic_width -
 *        16*pic_stride, r7 = psadframe, r8-r11 = output write pointers,
 *        r12 = width counter, q12 = running frame SAD.
 * Saved: r4-r11 (32 bytes) plus q6-q7 (32 bytes). r12 is a scratch register
 *        under the procedure call standard and is not saved, so the first
 *        stack argument sits at sp + 64.
 * Clobb: r0-r3, r12, q0-q2, q8-q12, flags.
 */
WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
    stmdb       sp!, {r4-r11}           // 8 registers = 32 bytes
    vpush       {q6-q7}                 // 32 bytes, 64 in total

    ldr         r4, [sp, #64]           //r4 keeps the pic_stride

    sub         r5, r4, #1
    lsl         r5, r5, #4              //r5 keeps the little step

    lsl         r6, r4, #4
    sub         r6, r2, r6              //r6 keeps the big step (negative of the advance)

    ldr         r7, [sp, #68]           //psadframe
    ldr         r8, [sp, #72]           //psad8x8
    ldr         r9, [sp, #76]           //psum16x16
    ldr         r10, [sp, #80]          //psqsum16x16
    ldr         r11, [sp, #84]          //psqdiff16x16

    vmov.i8     q12, #0                 //frame SAD accumulator
vaa_calc_sad_ssd_height_loop:

    mov         r12, r2
vaa_calc_sad_ssd_width_loop:

    SAD_SSD_16x16 r0,r1,r4
    //psad8x8
    vst4.32     {d12[0], d13[0], d14[0], d15[0]}, [r8]!

    sub         r0, r0, r5              //jump to next 16x16
    sub         r1, r1, r5              //jump to next 16x16

    //psum16x16
    SSD_SAD_SD_MAD_PADDL q9, d18, d19
    vst1.32     {d18[0]}, [r9]!         //psum16x16

    //psqsum16x16
    vpaddl.s32  q10, q10
    vadd.i32    d20, d20, d21
    vst1.32     {d20[0]}, [r10]!        //psqsum16x16

    //psqdiff16x16
    vpaddl.s32  q8, q8
    vadd.i32    d16, d16, d17
    subs        r12, #16                // flags are consumed by the bne below
    vst1.32     {d16[0]}, [r11]!        //psqdiff16x16

    bne         vaa_calc_sad_ssd_width_loop

    sub         r0, r0, r6              //jump to next 16 x width
    sub         r1, r1, r6              //jump to next 16 x width

    subs        r3, #16
    bne         vaa_calc_sad_ssd_height_loop

    vadd.i32    d24, d24, d25
    vst1.32     {d24[0]}, [r7]

    vpop        {q6-q7}
    ldmia       sp!, {r4-r11}
WELS_ASM_FUNC_END

#endif