• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */
32
#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

// Load one 16-byte row from the current frame (x0) and the reference frame
// (x1), post-increment both pointers by the stride held in x4, then
// accumulate the widened absolute differences into the caller's registers:
//   \arg0 += |cur[0..7]  - ref[0..7]|   (8 x u16 lanes, left  8 columns)
//   \arg1 += |cur[8..15] - ref[8..15]|  (8 x u16 lanes, right 8 columns)
// Clobbers: v0, v1.
.macro ABS_SUB_SUM_16BYTES arg0, arg1
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabal   \arg0, v0.8b, v1.8b
    uabal2  \arg1, v0.16b,v1.16b
.endm
42
// Process 8 consecutive 16-byte rows of cur (x0) vs ref (x1). The first row
// initializes the accumulators with uabdl/uabdl2 (absolute-difference long);
// the remaining seven rows are added via ABS_SUB_SUM_16BYTES.
// On exit: \arg0 = SAD of the left 8x8 half, \arg1 = SAD of the right 8x8
// half, each still spread across 8 u16 lanes (caller must reduce, e.g.
// with uaddlv).
// Clobbers: v0, v1; advances x0/x1 by 8*stride.
.macro ABS_SUB_SUM_8x16BYTES arg0, arg1
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabdl   \arg0, v0.8b, v1.8b
    uabdl2  \arg1, v0.16b,v1.16b

    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
    ABS_SUB_SUM_16BYTES \arg0, \arg1
.endm
57
/*
 * void VAACalcSad_AArch64_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height,
 *                              int32_t pic_stride, int32_t *psadframe, int32_t *psad8x8)
 *
 * Walks the picture in 16x16 macroblocks, writing the four 8x8 SADs of each
 * macroblock to psad8x8 and the whole-frame SAD to *psadframe.
 * pic_width and pic_height are assumed to be multiples of 16 (loops count
 * down by 16 to exactly zero).
 * Register map: x0=cur, x1=ref, w2=pic_width, w3=pic_height, x4=pic_stride,
 * x5=psadframe, x6=psad8x8, v31=frame-SAD accumulator.
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSad_AArch64_neon
    eor     v31.16b, v31.16b, v31.16b   // frame SAD accumulator = 0

    SIGN_EXTENSION x4, w4
    lsl     x9, x4, #4
    sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
    sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width
vaa_calc_sad_loop0:                     // per row of 16x16 macroblocks
    mov     w11, w2                     // w11 = columns remaining in this row
vaa_calc_sad_loop1:                     // per 16x16 macroblock
    ABS_SUB_SUM_8x16BYTES v2.8h, v3.8h  // top 8 rows:    v2=top-left, v3=top-right
    ABS_SUB_SUM_8x16BYTES v4.8h, v5.8h  // bottom 8 rows: v4=bottom-left, v5=bottom-right

    uaddlv  s2, v2.8h                   // reduce each 8x8 SAD to a 32-bit scalar
    uaddlv  s3, v3.8h
    uaddlv  s4, v4.8h
    uaddlv  s5, v5.8h

    st4     {v2.s, v3.s, v4.s, v5.s}[0], [x6], #16  // psad8x8[0..3] for this MB
    sub     x0, x0, x10                 // back up 16 rows, forward 16 columns
    sub     x1, x1, x10
    sub     w11, w11, #16
    add     v6.2s, v2.2s, v3.2s         // fold the four 8x8 SADs...
    add     v7.2s, v4.2s, v5.2s
    add     v6.2s, v6.2s, v7.2s
    add     v31.2s, v31.2s, v6.2s       // ...into the frame accumulator
    cbnz    w11, vaa_calc_sad_loop1

    add     x0, x0, x9                  // advance to the next macroblock row
    add     x1, x1, x9
    sub     w3, w3, #16
    cbnz    w3, vaa_calc_sad_loop0

    str     s31, [x5]                   // *psadframe = total SAD

WELS_ASM_AARCH64_FUNC_END
98
// Process 8 rows x 16 bytes, producing per-half statistics for the BGD path:
//   v31.16b = per-byte running maximum absolute difference (MAD candidates)
//   v2.8h   = SAD, pairwise-widened (lanes 0-3 = left 8x8, lanes 4-7 = right 8x8)
//   v4.8h   = sum of cur bytes (same lane layout)
//   v5.8h   = sum of ref bytes (same lane layout)
// Clobbers: v0, v1, v30; advances x0/x1 by 8*stride.
.macro SAD_SD_MAD_8x16BYTES
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v31.16b, v0.16b, v1.16b     // first row seeds the per-byte max
    uaddlp  v2.8h, v31.16b              // seed SAD accumulator
    uaddlp  v4.8h, v0.16b               // seed cur-sum accumulator
    uaddlp  v5.8h, v1.16b               // seed ref-sum accumulator
.rept 7
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v30.16b, v0.16b, v1.16b
    umax    v31.16b, v31.16b,v30.16b    // running per-byte max -> MAD
    uadalp  v2.8h, v30.16b              // accumulate SAD
    uadalp  v4.8h, v0.16b               // accumulate cur sum
    uadalp  v5.8h, v1.16b               // accumulate ref sum
.endr
.endm
/*
 * void VAACalcSadBgd_AArch64_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height,
 *                                 int32_t pic_stride, int32_t *psadframe, int32_t *psad8x8,
 *                                 int32_t *p_sd8x8, uint8_t *p_mad8x8)
 *
 * Per 8x8 block this emits the SAD (psad8x8), the sum difference cur-ref
 * (p_sd8x8) and the maximum absolute pixel difference (p_mad8x8); the
 * whole-frame SAD goes to *psadframe. Width/height assumed multiples of 16.
 * Register map: x0=cur, x1=ref, w2=width, w3=height, x4=stride, x5=psadframe,
 * x6=psad8x8, x7=p_sd8x8, x15=p_mad8x8 (9th argument, read from the stack),
 * v28=frame-SAD accumulator.
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadBgd_AArch64_neon
    ldr     x15, [sp, #0]               // p_mad8x8 (9th argument)
    eor     v28.16b, v28.16b, v28.16b   // frame SAD accumulator = 0

    SIGN_EXTENSION x4, w4
    lsl     x9, x4, #4
    sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
    sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width
vaa_calc_sad_bgd_loop0:                 // per row of 16x16 macroblocks
    mov     w11, w2                     // w11 = columns remaining in this row
vaa_calc_sad_bgd_loop1:                 // per 16x16 macroblock
    SAD_SD_MAD_8x16BYTES                // top 8 rows: v31=mad, v2=sad, v4/v5=byte sums
    umaxv   b24, v31.8b                 // MAD of top-left 8x8
    ins     v31.d[0], v31.d[1]          // bring right-half lanes down
    umaxv   b25, v31.8b                 // MAD of top-right 8x8
    uaddlv  s20, v2.4h                  // SAD of top-left 8x8
    ins     v2.d[0], v2.d[1]
    uaddlv  s21, v2.4h                  // SAD of top-right 8x8
    usubl   v6.4s, v4.4h, v5.4h         // cur-ref partial sums (left half)
    usubl2  v7.4s, v4.8h, v5.8h         // cur-ref partial sums (right half)
    addv    s16, v6.4s                  // SD of top-left 8x8
    addv    s17, v7.4s                  // SD of top-right 8x8

    SAD_SD_MAD_8x16BYTES                // bottom 8 rows
    umaxv   b26, v31.8b                 // MAD of bottom-left 8x8
    ins     v31.d[0], v31.d[1]
    umaxv   b27, v31.8b                 // MAD of bottom-right 8x8
    uaddlv  s22, v2.4h                  // SAD of bottom-left 8x8
    ins     v2.d[0], v2.d[1]
    uaddlv  s23, v2.4h                  // SAD of bottom-right 8x8
    usubl   v6.4s, v4.4h, v5.4h
    usubl2  v7.4s, v4.8h, v5.8h
    addv    s18, v6.4s                  // SD of bottom-left 8x8
    addv    s19, v7.4s                  // SD of bottom-right 8x8
    st4     {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // psad8x8[0..3]

    sub     x0, x0, x10                 // back up 16 rows, forward 16 columns
    sub     x1, x1, x10
    st4     {v16.s, v17.s, v18.s, v19.s}[0], [x7], #16  // p_sd8x8[0..3]
    sub     w11, w11, #16
    st4     {v24.b, v25.b, v26.b, v27.b}[0], [x15], #4  // p_mad8x8[0..3]
    add     v29.2s, v20.2s, v21.2s      // fold the four SADs into the frame total
    add     v30.2s, v22.2s, v23.2s
    add     v29.2s, v29.2s, v30.2s
    add     v28.2s, v28.2s, v29.2s
    cbnz    w11, vaa_calc_sad_bgd_loop1

    add     x0, x0, x9                  // advance to the next macroblock row
    add     x1, x1, x9
    sub     w3, w3, #16
    cbnz    w3, vaa_calc_sad_bgd_loop0
    str     s28, [x5]                   // *psadframe = total SAD

WELS_ASM_AARCH64_FUNC_END
174
// Top half (rows 0-7) of a 16x16 macroblock for the SAD+SSD+BGD path.
// Seeds the per-macroblock accumulators:
//   v29.4s  = sum of squared differences              (p_sqdiff)
//   v28.8h  = sum of cur bytes                        (p_sum)
//   v27.4s  = sum of squared cur bytes                (p_sqsum)
//   v2.8h   = SAD, lanes 0-3 left 8x8 / 4-7 right 8x8 (p_sad)
//   v4/v5   = cur/ref byte sums for the SD terms
//   v31.16b = per-byte running max abs diff           (p_mad)
// Clobbers: v0, v1, v3, v30; advances x0/x1 by 8*stride.
.macro SAD_SSD_BGD_8x16BYTES_1
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v31.16b, v0.16b, v1.16b
    umull   v30.8h, v31.8b, v31.8b      // square the abs diffs...
    uaddlp  v29.4s, v30.8h
    umull2  v30.8h, v31.16b, v31.16b
    uadalp  v29.4s, v30.8h      //  p_sqdiff

    uaddlp  v28.8h, v0.16b      //  p_sum
    umull   v30.8h, v0.8b, v0.8b        // square the cur pixels...
    uaddlp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h      //  p_sqsum

    uaddlp  v2.8h, v31.16b      //  p_sad
    uaddlp  v4.8h, v0.16b
    uaddlp  v5.8h, v1.16b
.rept 7
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v3.16b, v0.16b, v1.16b
    umax    v31.16b, v31.16b,v3.16b     //p_mad
    umull   v30.8h, v3.8b, v3.8b
    uadalp  v29.4s, v30.8h
    umull2  v30.8h, v3.16b, v3.16b
    uadalp  v29.4s, v30.8h              //  p_sqdiff

    uadalp  v28.8h, v0.16b              //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h              //  p_sqsum

    uadalp  v2.8h, v3.16b              //p_sad
    uadalp  v4.8h, v0.16b
    uadalp  v5.8h, v1.16b               //p_sd
.endr
.endm
214
// Bottom half (rows 8-15) of a 16x16 macroblock for the SAD+SSD+BGD path.
// Continues accumulating the macroblock-wide registers started by
// SAD_SSD_BGD_8x16BYTES_1 (v29=p_sqdiff, v28=p_sum, v27=p_sqsum via uadalp
// from the very first row), and produces the bottom-half results in their
// own registers:
//   v16.8h  = SAD (lanes 0-3 bottom-left 8x8, 4-7 bottom-right 8x8)
//   v6/v7   = cur/ref byte sums for the SD terms
//   v26.16b = per-byte running max abs diff (p_mad)
// Clobbers: v0, v1, v3, v30; advances x0/x1 by 8*stride.
.macro SAD_SSD_BGD_8x16BYTES_2
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v26.16b, v0.16b, v1.16b
    umull   v30.8h, v26.8b, v26.8b
    uadalp  v29.4s, v30.8h
    umull2  v30.8h, v26.16b, v26.16b
    uadalp  v29.4s, v30.8h      //  p_sqdiff

    uadalp  v28.8h, v0.16b      //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h      //  p_sqsum

    uaddlp  v16.8h,v26.16b      //  p_sad
    uaddlp  v6.8h, v0.16b
    uaddlp  v7.8h, v1.16b
.rept 7
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v3.16b, v0.16b, v1.16b
    umax    v26.16b, v26.16b,v3.16b     //p_mad
    umull   v30.8h, v3.8b, v3.8b
    uadalp  v29.4s, v30.8h
    umull2  v30.8h, v3.16b, v3.16b
    uadalp  v29.4s, v30.8h              //  p_sqdiff

    uadalp  v28.8h, v0.16b              //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h              //  p_sqsum

    uadalp  v16.8h, v3.16b              //p_sad
    uadalp  v6.8h, v0.16b
    uadalp  v7.8h, v1.16b               //p_sd
.endr
.endm
254
/*
 * void VAACalcSadSsdBgd_AArch64_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height,
 *        int32_t pic_stride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
 *        int32_t *psqdiff16x16, int32_t *p_sd8x8, uint8_t *p_mad8x8)
 *
 * Per 16x16 macroblock: four 8x8 SADs (psad8x8), sum of cur pixels
 * (psum16x16), sum of squared cur pixels (psqsum16x16), sum of squared
 * differences (psqdiff16x16), per-8x8 sum difference cur-ref (p_sd8x8) and
 * per-8x8 max abs diff (p_mad8x8); whole-frame SAD -> *psadframe.
 * Register map: x0=cur, x1=ref, w2=width, w3=height, x4=stride, x5=psadframe,
 * x6=psad8x8, x7=psum16x16, x12=psqsum16x16, x13=psqdiff16x16, x14=p_sd8x8,
 * x15=p_mad8x8 (arguments 9-12 read from the stack), v17=frame-SAD accumulator.
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadSsdBgd_AArch64_neon
    ldr     x12, [sp, #0]   //psqsum16x16
    ldr     x13, [sp, #8]   //psqdiff16x16
    ldr     x14, [sp, #16]  //p_sd8x8
    ldr     x15, [sp, #24]  //p_mad8x8
    eor     v17.16b, v17.16b, v17.16b   // frame SAD accumulator = 0

    SIGN_EXTENSION x4, w4
    lsl     x9, x4, #4
    sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
    sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width

vaa_calc_sad_ssd_bgd_height_loop:       // per row of 16x16 macroblocks
    mov     w11, w2                     // w11 = columns remaining in this row
vaa_calc_sad_ssd_bgd_width_loop:        // per 16x16 macroblock
    SAD_SSD_BGD_8x16BYTES_1     //psad:v2, v16, psum:v28, psqsum:v27, psqdiff:v29, psd:v4, v5, v6, v7, pmad:v31, v26
    SAD_SSD_BGD_8x16BYTES_2

    umaxv   b22, v31.8b                 // MAD of top-left 8x8
    ins     v31.d[0], v31.d[1]          // bring right-half lanes down
    umaxv   b23, v31.8b                 // MAD of top-right 8x8
    umaxv   b24, v26.8b                 // MAD of bottom-left 8x8
    ins     v26.d[0], v26.d[1]
    umaxv   b25, v26.8b                 // MAD of bottom-right 8x8
    st4     {v22.b, v23.b, v24.b, v25.b}[0], [x15], #4  // p_mad8x8[0..3]

    usubl   v20.4s, v4.4h, v5.4h        // cur-ref partial sums, top halves
    usubl2  v21.4s, v4.8h, v5.8h
    addv    s20, v20.4s                 // SD of top-left 8x8
    addv    s21, v21.4s                 // SD of top-right 8x8
    usubl   v22.4s, v6.4h, v7.4h        // bottom halves
    usubl2  v23.4s, v6.8h, v7.8h
    addv    s22, v22.4s                 // SD of bottom-left 8x8
    addv    s23, v23.4s                 // SD of bottom-right 8x8
    st4     {v20.s, v21.s, v22.s, v23.s}[0], [x14], #16 // p_sd8x8[0..3]

    uaddlv  s20, v2.4h                  // SAD of top-left 8x8
    ins     v2.d[0], v2.d[1]
    uaddlv  s21, v2.4h                  // SAD of top-right 8x8
    uaddlv  s22, v16.4h                 // SAD of bottom-left 8x8
    ins     v16.d[0], v16.d[1]
    uaddlv  s23, v16.4h                 // SAD of bottom-right 8x8
    st4     {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // psad8x8[0..3]

    uaddlv  s28, v28.8h                 // 16x16 pixel sum
    str     s28, [x7], #4               // *psum16x16++
    addv    s27, v27.4s
    str     s27, [x12], #4              // *psqsum16x16++
    addv    s29, v29.4s
    str     s29, [x13], #4              // *psqdiff16x16++

    sub     x0, x0, x10                 // back up 16 rows, forward 16 columns
    sub     x1, x1, x10
    sub     w11, w11, #16
    add     v29.2s, v20.2s, v21.2s      // v29/v30 free again: fold MB SAD
    add     v30.2s, v22.2s, v23.2s
    add     v29.2s, v29.2s, v30.2s
    add     v17.2s, v17.2s, v29.2s      // into the frame accumulator
    cbnz    w11, vaa_calc_sad_ssd_bgd_width_loop

    add     x0, x0, x9                  // advance to the next macroblock row
    add     x1, x1, x9
    sub     w3, w3, #16
    cbnz    w3, vaa_calc_sad_ssd_bgd_height_loop
    str     s17, [x5]                   // *psadframe = total SAD
WELS_ASM_AARCH64_FUNC_END
325
326
// Top half (rows 0-7) of a 16x16 macroblock for the SAD+SSD path.
// Seeds the per-macroblock accumulators:
//   v29.4s = sum of squared differences              (p_sqdiff)
//   v28.8h = sum of cur bytes                        (p_sum)
//   v27.4s = sum of squared cur bytes                (p_sqsum)
//   v2.8h  = SAD, lanes 0-3 left 8x8 / 4-7 right 8x8 (p_sad)
// Clobbers: v0, v1, v3, v30, v31; advances x0/x1 by 8*stride.
.macro SAD_SSD_8x16BYTES_1
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v31.16b, v0.16b, v1.16b
    umull   v30.8h, v31.8b, v31.8b      // square the abs diffs...
    uaddlp  v29.4s, v30.8h
    umull2  v30.8h, v31.16b, v31.16b
    uadalp  v29.4s, v30.8h      //  p_sqdiff

    uaddlp  v28.8h, v0.16b      //  p_sum
    umull   v30.8h, v0.8b, v0.8b        // square the cur pixels...
    uaddlp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h      //  p_sqsum

    uaddlp  v2.8h, v31.16b      //  p_sad
.rept 7
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v3.16b, v0.16b, v1.16b
    umull   v30.8h, v3.8b, v3.8b
    uadalp  v29.4s, v30.8h
    umull2  v30.8h, v3.16b, v3.16b
    uadalp  v29.4s, v30.8h              //  p_sqdiff

    uadalp  v28.8h, v0.16b              //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h              //  p_sqsum

    uadalp  v2.8h, v3.16b              //p_sad
.endr
.endm
361
// Bottom half (rows 8-15) of a 16x16 macroblock for the SAD+SSD path.
// Continues accumulating the macroblock-wide registers started by
// SAD_SSD_8x16BYTES_1 (v29=p_sqdiff, v28=p_sum, v27=p_sqsum via uadalp from
// the very first row) and produces the bottom-half SAD in v16.8h
// (lanes 0-3 = bottom-left 8x8, lanes 4-7 = bottom-right 8x8).
// Clobbers: v0, v1, v3, v26, v30; advances x0/x1 by 8*stride.
// NOTE(review): the original also seeded v6/v7 with cur/ref byte sums
// (uaddlp v6.8h,v0.16b / uaddlp v7.8h,v1.16b), a leftover from the BGD
// variant; they were never accumulated in the .rept body and never read by
// the only caller (VAACalcSadSsd_AArch64_neon), so those two dead
// instructions have been removed.
.macro SAD_SSD_8x16BYTES_2
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v26.16b, v0.16b, v1.16b
    umull   v30.8h, v26.8b, v26.8b
    uadalp  v29.4s, v30.8h
    umull2  v30.8h, v26.16b, v26.16b
    uadalp  v29.4s, v30.8h      //  p_sqdiff

    uadalp  v28.8h, v0.16b      //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h      //  p_sqsum

    uaddlp  v16.8h,v26.16b      //  p_sad
.rept 7
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v3.16b, v0.16b, v1.16b
    umull   v30.8h, v3.8b, v3.8b
    uadalp  v29.4s, v30.8h
    umull2  v30.8h, v3.16b, v3.16b
    uadalp  v29.4s, v30.8h              //  p_sqdiff

    uadalp  v28.8h, v0.16b              //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h              //  p_sqsum

    uadalp  v16.8h, v3.16b              //p_sad
.endr
.endm
/*
 * void VAACalcSadSsd_AArch64_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height,
 *        int32_t pic_stride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16,
 *        int32_t *psqdiff16x16)
 *
 * Per 16x16 macroblock: four 8x8 SADs (psad8x8), sum of cur pixels
 * (psum16x16), sum of squared cur pixels (psqsum16x16) and sum of squared
 * differences (psqdiff16x16); whole-frame SAD -> *psadframe.
 * Register map: x0=cur, x1=ref, w2=width, w3=height, x4=stride, x5=psadframe,
 * x6=psad8x8, x7=psum16x16, x12=psqsum16x16, x13=psqdiff16x16 (stack args),
 * v17=frame-SAD accumulator.
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadSsd_AArch64_neon
    ldr     x12, [sp, #0]   //psqsum16x16
    ldr     x13, [sp, #8]   //psqdiff16x16
    eor     v17.16b, v17.16b, v17.16b   // frame SAD accumulator = 0

    SIGN_EXTENSION x4, w4
    lsl     x9, x4, #4
    sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
    sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width

vaa_calc_sad_ssd_height_loop:           // per row of 16x16 macroblocks
    mov     w11, w2                     // w11 = columns remaining in this row
vaa_calc_sad_ssd_width_loop:            // per 16x16 macroblock
    SAD_SSD_8x16BYTES_1     //psad:v2, v16, psum:v28, psqsum:v27, psqdiff:v29
    SAD_SSD_8x16BYTES_2

    uaddlv  s20, v2.4h                  // SAD of top-left 8x8
    ins     v2.d[0], v2.d[1]            // bring right-half lanes down
    uaddlv  s21, v2.4h                  // SAD of top-right 8x8
    uaddlv  s22, v16.4h                 // SAD of bottom-left 8x8
    ins     v16.d[0], v16.d[1]
    uaddlv  s23, v16.4h                 // SAD of bottom-right 8x8
    st4     {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // psad8x8[0..3]

    uaddlv  s28, v28.8h                 // 16x16 pixel sum
    str     s28, [x7], #4               // *psum16x16++
    addv    s27, v27.4s
    str     s27, [x12], #4              // *psqsum16x16++
    addv    s29, v29.4s
    str     s29, [x13], #4              // *psqdiff16x16++

    sub     x0, x0, x10                 // back up 16 rows, forward 16 columns
    sub     x1, x1, x10
    sub     w11, w11, #16
    add     v29.2s, v20.2s, v21.2s      // v29/v30 free again: fold MB SAD
    add     v30.2s, v22.2s, v23.2s
    add     v29.2s, v29.2s, v30.2s
    add     v17.2s, v17.2s, v29.2s      // into the frame accumulator
    cbnz    w11, vaa_calc_sad_ssd_width_loop

    add     x0, x0, x9                  // advance to the next macroblock row
    add     x1, x1, x9
    sub     w3, w3, #16
    cbnz    w3, vaa_calc_sad_ssd_height_loop
    str     s17, [x5]                   // *psadframe = total SAD
WELS_ASM_AARCH64_FUNC_END
448
449
// Top half (rows 0-7) of a 16x16 macroblock for the SAD+variance path.
// Seeds the per-macroblock accumulators:
//   v2.8h  = SAD, lanes 0-3 left 8x8 / 4-7 right 8x8 (p_sad)
//   v28.8h = sum of cur bytes                        (p_sum)
//   v27.4s = sum of squared cur bytes                (p_sqsum)
// Clobbers: v0, v1, v3, v30, v31; advances x0/x1 by 8*stride.
.macro SAD_VAR_8x16BYTES_1
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v31.16b, v0.16b, v1.16b
    uaddlp  v2.8h, v31.16b      //  p_sad

    uaddlp  v28.8h, v0.16b      //  p_sum
    umull   v30.8h, v0.8b, v0.8b        // square the cur pixels...
    uaddlp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h      //  p_sqsum

.rept 7
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v3.16b, v0.16b, v1.16b
    uadalp  v2.8h, v3.16b              //p_sad

    uadalp  v28.8h, v0.16b              //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h              //  p_sqsum
.endr
.endm
// Bottom half (rows 8-15) of a 16x16 macroblock for the SAD+variance path.
// Continues accumulating the macroblock-wide registers started by
// SAD_VAR_8x16BYTES_1 (v28=p_sum, v27=p_sqsum via uadalp from the very
// first row) and produces the bottom-half SAD in v16.8h
// (lanes 0-3 = bottom-left 8x8, lanes 4-7 = bottom-right 8x8).
// Clobbers: v0, v1, v3, v26, v30; advances x0/x1 by 8*stride.
.macro SAD_VAR_8x16BYTES_2
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v26.16b, v0.16b, v1.16b
    uaddlp  v16.8h,v26.16b      //  p_sad

    uadalp  v28.8h, v0.16b      //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h      //  p_sqsum
.rept 7
    ld1     {v0.16b}, [x0], x4
    ld1     {v1.16b}, [x1], x4
    uabd    v3.16b, v0.16b, v1.16b
    uadalp  v16.8h, v3.16b              //p_sad

    uadalp  v28.8h, v0.16b              //  p_sum
    umull   v30.8h, v0.8b, v0.8b
    uadalp  v27.4s, v30.8h
    umull2  v30.8h, v0.16b, v0.16b
    uadalp  v27.4s, v30.8h              //  p_sqsum
.endr
.endm
499
/*
 * void VAACalcSadVar_AArch64_neon(uint8_t *cur_data, uint8_t *ref_data, int32_t pic_width, int32_t pic_height,
 *        int32_t pic_stride, int32_t *psadframe, int32_t *psad8x8, int32_t *psum16x16, int32_t *psqsum16x16)
 *
 * Per 16x16 macroblock: four 8x8 SADs (psad8x8), sum of cur pixels
 * (psum16x16) and sum of squared cur pixels (psqsum16x16); whole-frame
 * SAD -> *psadframe.
 * Register map: x0=cur, x1=ref, w2=width, w3=height, x4=stride, x5=psadframe,
 * x6=psad8x8, x7=psum16x16, x12=psqsum16x16 (stack arg), v17=frame-SAD
 * accumulator.
 */
WELS_ASM_AARCH64_FUNC_BEGIN VAACalcSadVar_AArch64_neon
    ldr     x12, [sp, #0]   //psqsum16x16
    eor     v17.16b, v17.16b, v17.16b   // frame SAD accumulator = 0

    SIGN_EXTENSION x4, w4
    lsl     x9, x4, #4
    sub     x10, x9, #16    //x10 keep the 16*pic_stride-16
    sub     x9, x9, x2      //x9 keep the 16*pic_stride-pic_width

vaa_calc_sad_var_height_loop:           // per row of 16x16 macroblocks
    mov     w11, w2                     // w11 = columns remaining in this row
vaa_calc_sad_var_width_loop:            // per 16x16 macroblock
    SAD_VAR_8x16BYTES_1     //psad:v2, v16, psum:v28, psqsum:v27
    SAD_VAR_8x16BYTES_2

    uaddlv  s20, v2.4h                  // SAD of top-left 8x8
    ins     v2.d[0], v2.d[1]            // bring right-half lanes down
    uaddlv  s21, v2.4h                  // SAD of top-right 8x8
    uaddlv  s22, v16.4h                 // SAD of bottom-left 8x8
    ins     v16.d[0], v16.d[1]
    uaddlv  s23, v16.4h                 // SAD of bottom-right 8x8
    st4     {v20.s, v21.s, v22.s, v23.s}[0], [x6], #16  // psad8x8[0..3]

    uaddlv  s28, v28.8h                 // 16x16 pixel sum
    str     s28, [x7], #4               // *psum16x16++
    addv    s27, v27.4s
    str     s27, [x12], #4              // *psqsum16x16++

    sub     x0, x0, x10                 // back up 16 rows, forward 16 columns
    sub     x1, x1, x10
    sub     w11, w11, #16
    add     v29.2s, v20.2s, v21.2s      // fold the four SADs...
    add     v30.2s, v22.2s, v23.2s
    add     v29.2s, v29.2s, v30.2s
    add     v17.2s, v17.2s, v29.2s      // ...into the frame accumulator

    cbnz    w11, vaa_calc_sad_var_width_loop

    add     x0, x0, x9                  // advance to the next macroblock row
    add     x1, x1, x9
    sub     w3, w3, #16
    cbnz    w3, vaa_calc_sad_var_height_loop
    str     s17, [x5]                   // *psadframe = total SAD
WELS_ASM_AARCH64_FUNC_END

#endif
550