/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

// Load one row of 16 pixels from cur (x0) and ref (x1), advance both by the
// stride in x4, and accumulate widened |cur - ref| into \arg0 (low 8 bytes)
// and \arg1 (high 8 bytes).
.macro ABS_SUB_SUM_16BYTES arg0, arg1
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabal \arg0, v0.8b, v1.8b
    uabal2 \arg1, v0.16b, v1.16b
.endm

// Absolute-difference sums over 8 rows x 16 pixels: the first row initializes
// the accumulators (uabdl), the remaining seven rows accumulate (uabal).
.macro ABS_SUB_SUM_8x16BYTES arg0, arg1
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabdl \arg0, v0.8b, v1.8b
    uabdl2 \arg1, v0.16b, v1.16b

    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
.endm

/*
 * void vaa_calc_sad_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,
 *                        int32_t *psadframe, int32_t *psad8x8)
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSad_AArch64_neon
    eor v31.16b, v31.16b, v31.16b   // frame SAD accumulator

    SIGN_EXTENSION x4, w4
    lsl x9, x4, #4
    sub x10, x9, #16                // x10 = 16*pic_stride - 16: step to the next 16x16 block in the row
    sub x9, x9, x2                  // x9  = 16*pic_stride - pic_width: step to the next row of blocks
vaa_calc_sad_loop0:
    mov w11, w2
vaa_calc_sad_loop1:
    ABS_SUB_SUM_8x16BYTES v2.8h, v3.8h  // rows 0-7:  v2 = left 8x8, v3 = right 8x8
    ABS_SUB_SUM_8x16BYTES v4.8h, v5.8h  // rows 8-15: v4 = left 8x8, v5 = right 8x8

    uaddlv s2, v2.8h
    uaddlv s3, v3.8h
    uaddlv s4, v4.8h
    uaddlv s5, v5.8h

    st4 {v2.s, v3.s, v4.s, v5.s}[0], [x6], #16  // four 8x8 SADs -> psad8x8
    sub x0, x0, x10
    sub x1, x1, x10
    sub w11, w11, #16
    add v6.2s, v2.2s, v3.2s
    add v7.2s, v4.2s, v5.2s
    add v6.2s, v6.2s, v7.2s
    add v31.2s, v31.2s, v6.2s
    cbnz w11, vaa_calc_sad_loop1

    add x0, x0, x9
    add x1, x1, x9
    sub w3, w3, #16
    cbnz w3, vaa_calc_sad_loop0

    str s31, [x5]                   // frame total -> psadframe

WELS_ASM_AARCH64_FUNC_END

// Over 8 rows x 16 pixels: v2 accumulates |cur - ref| (SAD), v4/v5 accumulate
// cur/ref pixel sums (for SD), and v31 keeps the per-byte maximum |cur - ref|
// (MAD).
.macro SAD_SD_MAD_8x16BYTES
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v31.16b, v0.16b, v1.16b
    uaddlp v2.8h, v31.16b
    uaddlp v4.8h, v0.16b
    uaddlp v5.8h, v1.16b
.rept 7
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v30.16b, v0.16b, v1.16b
    umax v31.16b, v31.16b, v30.16b
    uadalp v2.8h, v30.16b
    uadalp v4.8h, v0.16b
    uadalp v5.8h, v1.16b
.endr
.endm
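
/*
 * For orientation, a scalar sketch of what VAACalcSad_AArch64_neon above is
 * assumed to compute (inferred from the code, not taken from the project's C
 * reference; helper and variable names are illustrative): each 16x16 block
 * contributes four 8x8 SADs, written to psad8x8 in top-left, top-right,
 * bottom-left, bottom-right order, and psadframe receives the frame total.
 *
 *   #include <stdint.h>
 *   #include <stdlib.h>
 *
 *   static void CalcSadSketch (const uint8_t* pCur, const uint8_t* pRef,
 *                              int32_t iWidth, int32_t iHeight, int32_t iStride,
 *                              int32_t* pFrameSad, int32_t* pSad8x8) {
 *     int32_t iFrameSad = 0;
 *     for (int32_t y = 0; y < iHeight; y += 16) {
 *       for (int32_t x = 0; x < iWidth; x += 16) {
 *         for (int32_t b = 0; b < 4; b++) {        // TL, TR, BL, BR 8x8 sub-block
 *           const uint8_t* c = pCur + (y + (b >> 1) * 8) * iStride + x + (b & 1) * 8;
 *           const uint8_t* r = pRef + (y + (b >> 1) * 8) * iStride + x + (b & 1) * 8;
 *           int32_t iSad = 0;
 *           for (int32_t i = 0; i < 8; i++, c += iStride, r += iStride)
 *             for (int32_t j = 0; j < 8; j++)
 *               iSad += abs (c[j] - r[j]);
 *           *pSad8x8++ = iSad;
 *           iFrameSad += iSad;
 *         }
 *       }
 *     }
 *     *pFrameSad = iFrameSad;
 *   }
 */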
/*
 * void vaa_calc_sad_bgd_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,
 *                            int32_t *psadframe, int32_t *psad8x8, int32_t *p_sd8x8, uint8_t *p_mad8x8)
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadBgd_AArch64_neon
    ldr x15, [sp, #0]               // p_mad8x8 (ninth argument, passed on the stack)
    eor v28.16b, v28.16b, v28.16b   // frame SAD accumulator

    SIGN_EXTENSION x4, w4
    lsl x9, x4, #4
    sub x10, x9, #16                // x10 = 16*pic_stride - 16
    sub x9, x9, x2                  // x9  = 16*pic_stride - pic_width
vaa_calc_sad_bgd_loop0:
    mov w11, w2
vaa_calc_sad_bgd_loop1:
    SAD_SD_MAD_8x16BYTES            // rows 0-7
    umaxv b24, v31.8b               // MAD of the left 8x8
    ins v31.d[0], v31.d[1]
    umaxv b25, v31.8b               // MAD of the right 8x8
    uaddlv s20, v2.4h               // SAD of the left 8x8
    ins v2.d[0], v2.d[1]
    uaddlv s21, v2.4h               // SAD of the right 8x8
    usubl v6.4s, v4.4h, v5.4h
    usubl2 v7.4s, v4.8h, v5.8h
    addv s16, v6.4s                 // SD of the left 8x8
    addv s17, v7.4s                 // SD of the right 8x8

    SAD_SD_MAD_8x16BYTES            // rows 8-15
    umaxv b26, v31.8b
    ins v31.d[0], v31.d[1]
    umaxv b27, v31.8b
    uaddlv s22, v2.4h
    ins v2.d[0], v2.d[1]
    uaddlv s23, v2.4h
    usubl v6.4s, v4.4h, v5.4h
    usubl2 v7.4s, v4.8h, v5.8h
    addv s18, v6.4s
    addv s19, v7.4s
    st4 {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // four 8x8 SADs -> psad8x8

    sub x0, x0, x10
    sub x1, x1, x10
    st4 {v16.s, v17.s, v18.s, v19.s}[0], [x7], #16  // four 8x8 SDs -> p_sd8x8
    sub w11, w11, #16
    st4 {v24.b, v25.b, v26.b, v27.b}[0], [x15], #4  // four 8x8 MADs -> p_mad8x8
    add v29.2s, v20.2s, v21.2s
    add v30.2s, v22.2s, v23.2s
    add v29.2s, v29.2s, v30.2s
    add v28.2s, v28.2s, v29.2s
    cbnz w11, vaa_calc_sad_bgd_loop1

    add x0, x0, x9
    add x1, x1, x9
    sub w3, w3, #16
    cbnz w3, vaa_calc_sad_bgd_loop0
    str s28, [x5]                   // frame total -> psadframe

WELS_ASM_AARCH64_FUNC_END

.macro SAD_SSD_BGD_8x16BYTES_1
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v31.16b, v0.16b, v1.16b
    umull v30.8h, v31.8b, v31.8b
    uaddlp v29.4s, v30.8h
    umull2 v30.8h, v31.16b, v31.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uaddlp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uaddlp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uaddlp v2.8h, v31.16b           // p_sad
    uaddlp v4.8h, v0.16b
    uaddlp v5.8h, v1.16b
.rept 7
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v3.16b, v0.16b, v1.16b
    umax v31.16b, v31.16b, v3.16b   // p_mad
    umull v30.8h, v3.8b, v3.8b
    uadalp v29.4s, v30.8h
    umull2 v30.8h, v3.16b, v3.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uadalp v2.8h, v3.16b            // p_sad
    uadalp v4.8h, v0.16b
    uadalp v5.8h, v1.16b            // p_sd
.endr
.endm
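
/*
 * SAD_SSD_BGD_8x16BYTES_1 above and SAD_SSD_BGD_8x16BYTES_2 below form a
 * pair: _1 starts fresh SAD/MAD accumulators (v2/v31) for the top eight rows
 * of a 16x16 block, _2 does the same for the bottom eight rows in v16/v26,
 * while the per-16x16 sum/sqsum/sqdiff accumulators (v28/v27/v29) run across
 * both halves. Per 8x8 block, the background-detection outputs SD and MAD
 * reduce to this scalar sketch (assumed semantics, illustrative names):
 *
 *   // One 8x8 block of cur (c) and ref (r), both with stride iStride:
 *   int32_t iSd = 0;    // SD:  sum(cur) - sum(ref)   -> p_sd8x8
 *   uint8_t uiMad = 0;  // MAD: max |cur - ref|       -> p_mad8x8
 *   for (int32_t i = 0; i < 8; i++, c += iStride, r += iStride)
 *     for (int32_t j = 0; j < 8; j++) {
 *       int32_t iDiff = c[j] - r[j];
 *       iSd += iDiff;
 *       uint8_t uiAbs = (uint8_t) (iDiff >= 0 ? iDiff : -iDiff);
 *       if (uiAbs > uiMad) uiMad = uiAbs;
 *     }
 */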
.macro SAD_SSD_BGD_8x16BYTES_2
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v26.16b, v0.16b, v1.16b
    umull v30.8h, v26.8b, v26.8b
    uadalp v29.4s, v30.8h
    umull2 v30.8h, v26.16b, v26.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uaddlp v16.8h, v26.16b          // p_sad
    uaddlp v6.8h, v0.16b
    uaddlp v7.8h, v1.16b
.rept 7
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v3.16b, v0.16b, v1.16b
    umax v26.16b, v26.16b, v3.16b   // p_mad
    umull v30.8h, v3.8b, v3.8b
    uadalp v29.4s, v30.8h
    umull2 v30.8h, v3.16b, v3.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uadalp v16.8h, v3.16b           // p_sad
    uadalp v6.8h, v0.16b
    uadalp v7.8h, v1.16b            // p_sd
.endr
.endm

/*
 * void vaa_calc_sad_ssd_bgd_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,
 *                                int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadSsdBgd_AArch64_neon
    ldr x12, [sp, #0]               // psqsum16x16
    ldr x13, [sp, #8]               // psqdiff16x16
    ldr x14, [sp, #16]              // p_sd8x8
    ldr x15, [sp, #24]              // p_mad8x8
    eor v17.16b, v17.16b, v17.16b   // frame SAD accumulator

    SIGN_EXTENSION x4, w4
    lsl x9, x4, #4
    sub x10, x9, #16                // x10 = 16*pic_stride - 16
    sub x9, x9, x2                  // x9  = 16*pic_stride - pic_width

vaa_calc_sad_ssd_bgd_height_loop:
    mov w11, w2
vaa_calc_sad_ssd_bgd_width_loop:
    SAD_SSD_BGD_8x16BYTES_1         // psad: v2,v16  psum: v28  psqsum: v27  psqdiff: v29  psd: v4,v5,v6,v7  pmad: v31,v26
    SAD_SSD_BGD_8x16BYTES_2

    umaxv b22, v31.8b
    ins v31.d[0], v31.d[1]
    umaxv b23, v31.8b
    umaxv b24, v26.8b
    ins v26.d[0], v26.d[1]
    umaxv b25, v26.8b
    st4 {v22.b, v23.b, v24.b, v25.b}[0], [x15], #4  // four 8x8 MADs -> p_mad8x8

    usubl v20.4s, v4.4h, v5.4h
    usubl2 v21.4s, v4.8h, v5.8h
    addv s20, v20.4s
    addv s21, v21.4s
    usubl v22.4s, v6.4h, v7.4h
    usubl2 v23.4s, v6.8h, v7.8h
    addv s22, v22.4s
    addv s23, v23.4s
    st4 {v20.s, v21.s, v22.s, v23.s}[0], [x14], #16 // four 8x8 SDs -> p_sd8x8

    uaddlv s20, v2.4h
    ins v2.d[0], v2.d[1]
    uaddlv s21, v2.4h
    uaddlv s22, v16.4h
    ins v16.d[0], v16.d[1]
    uaddlv s23, v16.4h
    st4 {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // four 8x8 SADs -> psad8x8

    uaddlv s28, v28.8h
    str s28, [x7], #4               // -> psum16x16
    addv s27, v27.4s
    str s27, [x12], #4              // -> psqsum16x16
    addv s29, v29.4s
    str s29, [x13], #4              // -> psqdiff16x16

    sub x0, x0, x10
    sub x1, x1, x10
    sub w11, w11, #16
    add v29.2s, v20.2s, v21.2s
    add v30.2s, v22.2s, v23.2s
    add v29.2s, v29.2s, v30.2s
    add v17.2s, v17.2s, v29.2s
    cbnz w11, vaa_calc_sad_ssd_bgd_width_loop

    add x0, x0, x9
    add x1, x1, x9
    sub w3, w3, #16
    cbnz w3, vaa_calc_sad_ssd_bgd_height_loop
    str s17, [x5]                   // frame total -> psadframe
WELS_ASM_AARCH64_FUNC_END


.macro SAD_SSD_8x16BYTES_1
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v31.16b, v0.16b, v1.16b
    umull v30.8h, v31.8b, v31.8b
    uaddlp v29.4s, v30.8h
    umull2 v30.8h, v31.16b, v31.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uaddlp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uaddlp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uaddlp v2.8h, v31.16b           // p_sad
.rept 7
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v3.16b, v0.16b, v1.16b
    umull v30.8h, v3.8b, v3.8b
    uadalp v29.4s, v30.8h
    umull2 v30.8h, v3.16b, v3.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uadalp v2.8h, v3.16b            // p_sad
.endr
.endm
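
/*
 * The SAD_SSD_8x16BYTES_1/_2 pair splits a 16x16 block the same way as the
 * BGD macros above, minus the SD/MAD bookkeeping. Per 16x16 block, the three
 * extra outputs reduce to this scalar sketch (assumed semantics, illustrative
 * names; all three fit in int32_t, since 256 * 255 * 255 < 2^31):
 *
 *   // One 16x16 block of cur (c) and ref (r), both with stride iStride:
 *   int32_t iSum = 0, iSqSum = 0, iSqDiff = 0;
 *   for (int32_t i = 0; i < 16; i++, c += iStride, r += iStride)
 *     for (int32_t j = 0; j < 16; j++) {
 *       int32_t iDiff = c[j] - r[j];
 *       iSum    += c[j];           // -> psum16x16
 *       iSqSum  += c[j] * c[j];    // -> psqsum16x16
 *       iSqDiff += iDiff * iDiff;  // -> psqdiff16x16
 *     }
 */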
.macro SAD_SSD_8x16BYTES_2
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v26.16b, v0.16b, v1.16b
    umull v30.8h, v26.8b, v26.8b
    uadalp v29.4s, v30.8h
    umull2 v30.8h, v26.16b, v26.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uaddlp v16.8h, v26.16b          // p_sad
    uaddlp v6.8h, v0.16b            // v6/v7 are not consumed by VAACalcSadSsd below;
    uaddlp v7.8h, v1.16b            // apparently carried over from the BGD variant
.rept 7
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v3.16b, v0.16b, v1.16b
    umull v30.8h, v3.8b, v3.8b
    uadalp v29.4s, v30.8h
    umull2 v30.8h, v3.16b, v3.16b
    uadalp v29.4s, v30.8h           // p_sqdiff

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

    uadalp v16.8h, v3.16b           // p_sad
.endr
.endm
/*
 * void vaa_calc_sad_ssd_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,
 *                            int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16, int32_t *psqdiff16x16)
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadSsd_AArch64_neon
    ldr x12, [sp, #0]               // psqsum16x16
    ldr x13, [sp, #8]               // psqdiff16x16
    eor v17.16b, v17.16b, v17.16b   // frame SAD accumulator

    SIGN_EXTENSION x4, w4
    lsl x9, x4, #4
    sub x10, x9, #16                // x10 = 16*pic_stride - 16
    sub x9, x9, x2                  // x9  = 16*pic_stride - pic_width

vaa_calc_sad_ssd_height_loop:
    mov w11, w2
vaa_calc_sad_ssd_width_loop:
    SAD_SSD_8x16BYTES_1             // psad: v2,v16  psum: v28  psqsum: v27  psqdiff: v29
    SAD_SSD_8x16BYTES_2

    uaddlv s20, v2.4h
    ins v2.d[0], v2.d[1]
    uaddlv s21, v2.4h
    uaddlv s22, v16.4h
    ins v16.d[0], v16.d[1]
    uaddlv s23, v16.4h
    st4 {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // four 8x8 SADs -> psad8x8

    uaddlv s28, v28.8h
    str s28, [x7], #4               // -> psum16x16
    addv s27, v27.4s
    str s27, [x12], #4              // -> psqsum16x16
    addv s29, v29.4s
    str s29, [x13], #4              // -> psqdiff16x16

    sub x0, x0, x10
    sub x1, x1, x10
    sub w11, w11, #16
    add v29.2s, v20.2s, v21.2s
    add v30.2s, v22.2s, v23.2s
    add v29.2s, v29.2s, v30.2s
    add v17.2s, v17.2s, v29.2s
    cbnz w11, vaa_calc_sad_ssd_width_loop

    add x0, x0, x9
    add x1, x1, x9
    sub w3, w3, #16
    cbnz w3, vaa_calc_sad_ssd_height_loop
    str s17, [x5]                   // frame total -> psadframe
WELS_ASM_AARCH64_FUNC_END


.macro SAD_VAR_8x16BYTES_1
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v31.16b, v0.16b, v1.16b
    uaddlp v2.8h, v31.16b           // p_sad

    uaddlp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uaddlp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum

.rept 7
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v3.16b, v0.16b, v1.16b
    uadalp v2.8h, v3.16b            // p_sad

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum
.endr
.endm

.macro SAD_VAR_8x16BYTES_2
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v26.16b, v0.16b, v1.16b
    uaddlp v16.8h, v26.16b          // p_sad

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum
.rept 7
    ld1 {v0.16b}, [x0], x4
    ld1 {v1.16b}, [x1], x4
    uabd v3.16b, v0.16b, v1.16b
    uadalp v16.8h, v3.16b           // p_sad

    uadalp v28.8h, v0.16b           // p_sum
    umull v30.8h, v0.8b, v0.8b
    uadalp v27.4s, v30.8h
    umull2 v30.8h, v0.16b, v0.16b
    uadalp v27.4s, v30.8h           // p_sqsum
.endr
.endm
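
/*
 * VAACalcSadVar_AArch64_neon below keeps only the sum and sqsum statistics,
 * presumably so the caller can derive a per-16x16 variance as
 * E[x^2] - (E[x])^2; a sketch of that derivation (an assumption about the
 * caller, not part of this file's interface):
 *
 *   // iSum and iSqSum as produced for one 16x16 block (256 pixels):
 *   int32_t iMean     = iSum / 256;
 *   int32_t iVariance = iSqSum / 256 - iMean * iMean;
 */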
/*
 * void vaa_calc_sad_var_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height, int32_t pic_stride,
 *                            int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadVar_AArch64_neon
    ldr x12, [sp, #0]               // psqsum16x16
    eor v17.16b, v17.16b, v17.16b   // frame SAD accumulator

    SIGN_EXTENSION x4, w4
    lsl x9, x4, #4
    sub x10, x9, #16                // x10 = 16*pic_stride - 16
    sub x9, x9, x2                  // x9  = 16*pic_stride - pic_width

vaa_calc_sad_var_height_loop:
    mov w11, w2
vaa_calc_sad_var_width_loop:
    SAD_VAR_8x16BYTES_1             // psad: v2,v16  psum: v28  psqsum: v27
    SAD_VAR_8x16BYTES_2

    uaddlv s20, v2.4h
    ins v2.d[0], v2.d[1]
    uaddlv s21, v2.4h
    uaddlv s22, v16.4h
    ins v16.d[0], v16.d[1]
    uaddlv s23, v16.4h
    st4 {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // four 8x8 SADs -> psad8x8

    uaddlv s28, v28.8h
    str s28, [x7], #4               // -> psum16x16
    addv s27, v27.4s
    str s27, [x12], #4              // -> psqsum16x16

    sub x0, x0, x10
    sub x1, x1, x10
    sub w11, w11, #16
    add v29.2s, v20.2s, v21.2s
    add v30.2s, v22.2s, v23.2s
    add v29.2s, v29.2s, v30.2s
    add v17.2s, v17.2s, v29.2s

    cbnz w11, vaa_calc_sad_var_width_loop

    add x0, x0, x9
    add x1, x1, x9
    sub w3, w3, #16
    cbnz w3, vaa_calc_sad_var_height_loop
    str s17, [x5]                   // frame total -> psadframe
WELS_ASM_AARCH64_FUNC_END

#endif