/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

// ARM NEON code; assembled only when HAVE_NEON is defined.
#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"

// Accumulate the absolute differences of one pair of 16-byte rows.
//   arg0, arg1 : address registers of the two rows; each is post-incremented by arg2
//   arg2       : register holding the post-increment applied after each load
//   arg3       : q accumulator (8 x u16) for bytes 0-7 of the row
//   arg4       : q accumulator (8 x u16) for bytes 8-15 of the row
// Clobbers q14 and q15.
.macro ABS_SUB_SUM_16BYTES arg0, arg1, arg2, arg3, arg4
    vld1.32 {q15}, [\arg0], \arg2
    vld1.32 {q14}, [\arg1], \arg2
    vabal.u8 \arg3, d30, d28        //accumulate |a-b| of bytes 0-7
    vabal.u8 \arg4, d31, d29        //accumulate |a-b| of bytes 8-15
.endm

// Absolute-difference sums over 8 consecutive rows of 16 bytes.
// The first row uses vabdl, which overwrites arg3/arg4, so the accumulators
// need no separate clearing; the remaining 7 rows are accumulated with
// ABS_SUB_SUM_16BYTES.
//   arg0, arg1 : address registers of the two pictures, post-incremented by arg2 per row
//   arg2       : register holding the row step
//   arg3       : q result (8 x u16), bytes 0-7 of every row
//   arg4       : q result (8 x u16), bytes 8-15 of every row
// Clobbers q14 and q15.
.macro ABS_SUB_SUM_8x16BYTES arg0, arg1, arg2, arg3, arg4
    vld1.32 {q15}, [\arg0], \arg2
    vld1.32 {q14}, [\arg1], \arg2
    vabdl.u8 \arg3, d30, d28
    vabdl.u8 \arg4, d31, d29

    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
    ABS_SUB_SUM_16BYTES \arg0, \arg1, \arg2, \arg3, \arg4
.endm

// Horizontal sum of eight u16 lanes held in two d registers.
//   arg0, arg1 : d registers with 4 x u16 each
//   arg2       : d register receiving the total as a single 64-bit value
// Clobbers d31.
.macro SAD_8X16BITS arg0, arg1, arg2
    vadd.u16 d31, \arg0, \arg1      //4 x u16
    vpaddl.u16 d31, d31             //2 x u32
    vpaddl.u32 \arg2, d31           //1 x u64
.endm


// VAACalcSad_neon
// Walks two pictures in 16x16 blocks. For every block the four 8x8 SADs are
// stored to psad8x8; the sum of all SADs is stored to psadframe at the end.
//
// In:
//   r0, r1      : addresses of the two pictures (presumably current and
//                 reference picture - confirm against the C prototype)
//   r2          : pic_width
//   r3          : row counter, reduced by 16 per block row (presumably pic_height)
//   [sp, #0]    : pic_stride   (offsets as seen on entry)
//   [sp, #4]    : psadframe
//   [sp, #8]    : psad8x8
// Both loops end only when their counter reaches exactly zero, so r2 and r3
// have to be non-zero multiples of 16.
// Clobbers r0-r3, q0-q3, q8, q14, q15 and the flags; r4-r8 are saved and restored.
WELS_ASM_FUNC_BEGIN VAACalcSad_neon

    stmdb sp!, {r4-r8}              //5 registers, stack arguments now start at sp+20

    ldr r4, [sp, #20] //load pic_stride
    ldr r5, [sp, #28] //load psad8x8

    //Clear q8, the accumulator for "psadframe"
    vmov.s64 q8, #0

    //Get the jump distances used by the loops
    lsl r8, r4, #4
    sub r7, r8, #16 //R7 keep the 16*pic_stride-16
    sub r8, r2      //R8 keep the 16*pic_stride-pic_width

vaa_calc_sad_loop0:

    //R6 keep the pic_width
    mov r6, r2

vaa_calc_sad_loop1:

    //Process the 16x16 bytes: q0/q1 = upper 8 rows (left/right), q2/q3 = lower 8 rows
    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q0, q1
    ABS_SUB_SUM_8x16BYTES r0, r1, r4, q2, q3

    //Do the SAD: reduce each accumulator to one value in d0..d3
    SAD_8X16BITS d0, d1, d0
    SAD_8X16BITS d2, d3, d1
    SAD_8X16BITS d4, d5, d2
    SAD_8X16BITS d6, d7, d3

    //Write to "psad8x8" buffer
    vst4.32 {d0[0],d1[0],d2[0],d3[0]}, [r5]!


    //Adjust the input address: back up 16 rows and step 16 bytes to the right
    sub r0, r7
    sub r1, r7

    subs r6, #16                    //NEON instructions below leave the flags untouched

    //Save to calculate "psadframe"
    vadd.u32 q0, q1
    vadd.u32 q8, q0

    bne vaa_calc_sad_loop1

    //Adjust the input address: start of the next row of 16x16 blocks
    add r0, r8
    add r1, r8

    subs r3, #16
    bne vaa_calc_sad_loop0

    ldr r6, [sp, #24] //load psadframe
    vadd.u32 d16, d17
    vst1.32 {d16[0]}, [r6]

    ldmia sp!, {r4-r8}

WELS_ASM_FUNC_END


// Accumulate one pair of 16-byte rows into the sum, max and SAD accumulators.
//   arg0, arg1 : address registers of the two rows, post-incremented by arg2
//   arg2       : register holding the row step
//   arg3       : q accumulator (8 x u16), pairwise byte sums of the row read via arg0
//   arg4       : q accumulator (8 x u16), pairwise byte sums of the row read via arg1
//   arg5       : q register (16 x u8), running per-byte maximum of the absolute difference
//   arg6       : q accumulator (8 x u16), pairwise sums of the absolute difference
// Clobbers q0 and q1.
.macro SAD_SD_MAD_16BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6
    vld1.32 {q0}, [\arg0], \arg2
    vld1.32 {q1}, [\arg1], \arg2

    vpadal.u8 \arg3, q0
    vpadal.u8 \arg4, q1

    vabd.u8 q0, q0, q1              //abs_diff
    vmax.u8 \arg5, q0
    vpadal.u8 \arg6, q0
.endm

// Process 8 rows of 16 bytes of two pictures.
// The first row initialises every accumulator (vpaddl / vabd overwrite their
// destination); the remaining 7 rows are added with SAD_SD_MAD_16BYTES.
//   arg0, arg1 : address registers of the two pictures, post-incremented by arg2 per row
//   arg2       : register holding the row step
//   arg3       : q result (16 x u8), per-byte maximum absolute difference
//   arg4       : q result (8 x u16), pairwise sums of the absolute difference
//   arg5       : q result (8 x 16 bit), sums of the arg0 rows minus sums of the arg1 rows
// Clobbers q0-q3.
.macro SAD_SD_MAD_8x16BYTES arg0, arg1, arg2, arg3, arg4, arg5
    vld1.32 {q0}, [\arg0], \arg2
    vld1.32 {q1}, [\arg1], \arg2

    vpaddl.u8 q2, q0
    vpaddl.u8 q3, q1

    vabd.u8 \arg3, q0, q1
    vpaddl.u8 \arg4, \arg3       //abs_diff


    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4
    SAD_SD_MAD_16BYTES \arg0,\arg1,\arg2,q2,q3,\arg3,\arg4

    vsub.u16 \arg5, q2, q3
.endm

// Final reductions for the results of one SAD_SD_MAD_8x16BYTES.
//   arg0, arg1 : d registers (low and high half) holding the 16 maximum bytes
//   arg2       : d register for the result; byte 0 = maximum of arg0,
//                byte 1 = maximum of arg1 (the pair repeats in the other bytes)
//   arg3       : q register with 8 x u16, reduced in place to one 64-bit sum per d half
//   arg4       : q register with 8 x s16, reduced in place to one 64-bit sum per d half
// Clobbers d0.
.macro SAD_SD_MAD_CALC arg0, arg1, arg2, arg3, arg4
    vpmax.u8 d0, \arg0, \arg1 //8bytes
    vpmax.u8 d0, d0, d0 //4bytes
    vpmax.u8 \arg2, d0, d0 //2bytes

    vpaddl.u16 \arg3, \arg3
    vpaddl.u32 \arg3, \arg3
    vpaddl.s16 \arg4, \arg4
    vpaddl.s32 \arg4, \arg4
.endm

// VAACalcSadBgd_neon
// Walks two pictures in 16x16 blocks. For every block it stores, per 8x8
// sub-block, the SAD (psad8x8, 32 bit), the difference of the pixel sums
// (psd8x8, 32 bit) and the maximum absolute difference (pmad8x8, 8 bit).
// The sum of all SADs is stored to psadframe at the end.
//
// In:
//   r0, r1      : addresses of the two pictures (presumably current and
//                 reference picture - confirm against the C prototype)
//   r2          : pic_width
//   r3          : row counter, reduced by 16 per block row (presumably pic_height)
//   [sp, #0]    : pic_stride   (offsets as seen on entry)
//   [sp, #4]    : psadframe
//   [sp, #8]    : psad8x8
//   [sp, #12]   : psd8x8
//   [sp, #16]   : pmad8x8
// Both loops end only when their counter reaches exactly zero, so r2 and r3
// have to be non-zero multiples of 16.
// Clobbers r0-r3, q0-q3, q8-q15 and the flags; r4-r10 are saved and restored.
WELS_ASM_FUNC_BEGIN VAACalcSadBgd_neon

    stmdb sp!, {r4-r10}             //7 registers, stack arguments now start at sp+28

    ldr r4, [sp, #28] //load pic_stride
    ldr r5, [sp, #36] //load psad8x8
    ldr r6, [sp, #40] //load psd8x8
    ldr r7, [sp, #44] //load pmad8x8

    //Clear q15, the accumulator for "psadframe"
    vmov.s64 q15, #0

    //Get the jump distances used by the loops
    lsl r10, r4, #4
    sub r9, r10, #16 //R9 keep the 16*pic_stride-16
    sub r10, r2      //R10 keep the 16*pic_stride-pic_width

vaa_calc_sad_bgd_loop0:

    //R8 keep the pic_width
    mov r8, r2

vaa_calc_sad_bgd_loop1:

    //Process the 16x16 bytes        pmad psad psd
    SAD_SD_MAD_8x16BYTES r0, r1, r4, q13, q11, q9
    SAD_SD_MAD_8x16BYTES r0, r1, r4, q14, q12, q10

    SAD_SD_MAD_CALC d26, d27, d16, q11, q9
    SAD_SD_MAD_CALC d28, d29, d17, q12, q10

    //Write to "psad8x8" buffer
    vst4.32 {d22[0],d23[0],d24[0],d25[0]}, [r5]!
    //Adjust the input address: back up 16 rows and step 16 bytes to the right
    sub r0, r9
    sub r1, r9
    //Write to "psd8x8" buffer
    vst4.32 {d18[0],d19[0],d20[0],d21[0]}, [r6]!
    subs r8, #16                    //NEON instructions below leave the flags untouched
    //Write to "pmad8x8" buffer (two bytes from d16, two bytes from d17)
    vst2.16 {d16[0],d17[0]}, [r7]!
    //Save to calculate "psadframe"
    vadd.u32 q11, q12
    vadd.u32 q15, q11

    bne vaa_calc_sad_bgd_loop1

    //Adjust the input address: start of the next row of 16x16 blocks
    add r0, r10
    add r1, r10

    subs r3, #16
    bne vaa_calc_sad_bgd_loop0

    ldr r8, [sp, #32] //load psadframe
    vadd.u32 d30, d31
    vst1.32 {d30[0]}, [r8]
    ldmia sp!, {r4-r10}

WELS_ASM_FUNC_END


// Sum of squares of 16 bytes, overwriting the accumulator.
//   arg0, arg1 : d registers with 8 x u8 each
//   arg2       : q result (4 x u32); its previous content is discarded
//   arg3       : q scratch register for the 8 x u16 products
.macro SSD_MUL_SUM_16BYTES_RESET arg0, arg1, arg2, arg3
    vmull.u8   \arg3, \arg0, \arg0
    vpaddl.u16 \arg2, \arg3         //overwrite: start of a new sum

    vmull.u8   \arg3, \arg1, \arg1
    vpadal.u16 \arg2, \arg3
.endm

// Sum of squares of 16 bytes, added to the accumulator.
//   arg0, arg1 : d registers with 8 x u8 each
//   arg2       : q accumulator (4 x u32)
//   arg3       : q scratch register for the 8 x u16 products
.macro SSD_MUL_SUM_16BYTES arg0, arg1, arg2, arg3
    vmull.u8   \arg3, \arg0, \arg0
    vpadal.u16 \arg2, \arg3

    vmull.u8   \arg3, \arg1, \arg1
    vpadal.u16 \arg2, \arg3
.endm

// One 16-byte row of the SAD/SSD/BGD computation (neither first nor last row).
// Expects q1 to hold the ref row that belongs to the cur row loaded here; it
// was loaded by the preceding row macro. The ref row for the following row is
// loaded into q1 before this macro ends.
//   arg0 : address register of the cur rows, post-incremented by arg2
//   arg1 : address register of the ref rows, post-incremented by arg2
//   arg2 : register holding the row step
//   arg3 : q accumulator (8 x u16) for l_sad
// Updates q3, q4, q5, q8, q9, q10; clobbers q0, q1, q2, q11.
.macro SAD_SSD_BGD_16 arg0, arg1, arg2, arg3
    vld1.8 {q0}, [\arg0], \arg2 //load cur_row

    vpadal.u8 q3, q0    //add cur_row together
    vpadal.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16

    vpadal.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16

    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16

    vld1.8 {q1}, [\arg1], \arg2 //load ref_row of the next row
    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
.endm

//the last row of a 16x16 block
// Same work as SAD_SSD_BGD_16, but no further ref row is loaded.
// Note that the argument positions differ from SAD_SSD_BGD_16:
//   arg0 : address register of the cur rows, post-incremented by arg1
//   arg1 : register holding the row step
//   arg2 : q accumulator (8 x u16) for l_sad
// Expects q1 to hold the matching ref row.
// Updates q3, q4, q5, q8, q9, q10; clobbers q0, q2, q11.
.macro SAD_SSD_BGD_16_end arg0, arg1, arg2
    vld1.8 {q0}, [\arg0], \arg1 //load cur_row

    vpadal.u8 q3, q0    //add cur_row together
    vpadal.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vmax.u8 q5, q2                              //l_mad for 16 bytes reset for every 8x16

    vpadal.u8 \arg2, q2                         //l_sad for 16 bytes reset for every 8x16

    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16

    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
.endm

//for the begin of a 8x16 block, use some instructions to reset the register
// First row of the lower 8 rows of a 16x16 block. The per-8-row values
// (q3, q4, q5 and the l_sad accumulator) are overwritten instead of
// accumulated; the per-16x16 accumulators q8, q9, q10 keep accumulating.
// Expects q1 to hold the matching ref row; loads the ref row of the next row.
//   arg0 : address register of the cur rows, post-incremented by arg2
//   arg1 : address register of the ref rows, post-incremented by arg2
//   arg2 : register holding the row step
//   arg3 : q register (8 x u16) that receives the new l_sad start value
// Clobbers q0, q1, q2, q11.
.macro SAD_SSD_BGD_16_RESET_8x8 arg0, arg1, arg2, arg3
    vld1.8 {q0}, [\arg0], \arg2 //load cur_row

    vpaddl.u8 q3, q0    //add cur_row together
    vpaddl.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16

    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16


    SSD_MUL_SUM_16BYTES d4,d5, q8, q11          //q8 for l_sqiff    reset for every 16x16

    vld1.8 {q1}, [\arg1], \arg2 //load ref_row of the next row

    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
.endm

//for the begin of a 16x16 block, use some instructions to reset the register
// First row of a 16x16 block. Every accumulator (q3, q4, q5, the l_sad
// register, q8, q9, q10) is overwritten instead of accumulated. This macro
// loads its own ref row and afterwards the ref row of the next row, so the
// ref address advances by two rows.
//   arg0 : address register of the cur rows, post-incremented by arg2
//   arg1 : address register of the ref rows, post-incremented by arg2 (twice)
//   arg2 : register holding the row step
//   arg3 : q register (8 x u16) that receives the new l_sad start value
// Clobbers q0, q1, q2, q11.
.macro SAD_SSD_BGD_16_RESET_16x16 arg0, arg1, arg2, arg3
    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
    vld1.8 {q1}, [\arg1], \arg2 //load ref_row

    vpaddl.u8 q3, q0    //add cur_row together
    vpaddl.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vmov q5,q2         //calculate max and avoid reset to zero, l_mad for 16 bytes reset for every 8x16

    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16

    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16

    vld1.8 {q1}, [\arg1], \arg2 //load ref_row of the next row

    vpaddl.u8 q9, q0                                //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES_RESET d0,d1,q10,q11         //q10 for lsqsum    reset for every 16x16
.endm

//for each 8x16 block
// Reduce the per-8-row accumulators once 8 rows have been processed.
//   arg0 : d register receiving l_mad; byte 0 = maximum of d10 (bytes 0-7 of
//          the rows), byte 1 = maximum of d11 (bytes 8-15 of the rows)
//   arg1 : q register receiving the sum difference (q3 - q4), one 64-bit lane
//          per d half; callers store only the low 32 bits of each lane
//   arg2 : q register holding l_sad (8 x u16), reduced in place to one
//          64-bit sum per d half and then added to q12 (psadframe)
// Modifies q3, q4, q5 and q12.
.macro SAD_SSD_BGD_CALC_8x16 arg0, arg1, arg2

    vpmax.u8 d10, d10, d11 //8 numbers
    vpmax.u8 d10, d10, d10 //4 numbers
    vpmax.u8 d10, d10, d10 //2 numbers

    vmov \arg0, d10         //d26 d27 keeps the l_mad

    //p_sd8x8
    vpaddl.u16 q3, q3
    vpaddl.u16 q4, q4

    vsub.i32 \arg1, q3, q4
    vpaddl.u32 \arg1, \arg1         //low 32 bits of each lane hold the signed sum

    //psad8x8
    vpaddl.u16 \arg2, \arg2
    vpaddl.u32 \arg2, \arg2

    //psadframe
    vadd.i32 q12, \arg2
.endm

// Process one 16x16 block as two groups of 8 rows.
//   arg0 : address register of the cur picture, advanced by 16 rows
//   arg1 : address register of the ref picture, advanced by 16 rows
//   arg2 : register holding the row step
// Results: l_sad in q6/q7, l_mad in d26/d27, l_sd in q14/q15,
//          l_sqdiff in q8, l_sum in q9, l_sqsum in q10; q12 accumulates psadframe.
// Clobbers q0-q5 and q11 in addition.
.macro SAD_SSD_BGD_16x16 arg0, arg1, arg2
    //for one 8x16
    SAD_SSD_BGD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q6

    SAD_SSD_BGD_CALC_8x16 d26, q14, q6

    //for another 8x16
    SAD_SSD_BGD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_BGD_16_end \arg0, \arg2, q7         //last row: only the step is passed, no ref address

    SAD_SSD_BGD_CALC_8x16 d27, q15, q7
.endm

// Reduce a q register of 8 x s16 to a single 32-bit total.
//   arg0       : q register, reduced in place to one 64-bit sum per d half
//   arg1, arg2 : normally the low and high d halves of arg0;
//                arg1 = arg1 + arg2 (32-bit lanes), total ends up in lane 0 of arg1
.macro SSD_SAD_SD_MAD_PADDL arg0, arg1, arg2
    vpaddl.s16 \arg0, \arg0
    vpaddl.s32 \arg0, \arg0
    vadd.i32 \arg1, \arg1, \arg2
.endm


// VAACalcSadSsdBgd_neon
// Walks the cur and ref pictures in 16x16 blocks. Per block it stores:
//   psad8x8      : 4 x 32 bit, SAD of each 8x8 sub-block
//   p_sd8x8      : 4 x 32 bit, cur sum minus ref sum of each 8x8 sub-block
//   p_mad8x8     : 4 x 8 bit, maximum absolute difference of each 8x8 sub-block
//   psum16x16    : 32 bit, sum of the cur pixels
//   psqsum16x16  : 32 bit, sum of the squared cur pixels
//   psqdiff16x16 : 32 bit, sum of the squared differences
// The sum of all SADs is stored to psadframe at the end.
//
// In:
//   r0          : address of the cur picture
//   r1          : address of the ref picture
//   r2          : pic_width
//   r3          : row counter, reduced by 16 per block row (presumably pic_height)
//   [sp, #0]    : pic_stride   (offsets as seen on entry)
//   [sp, #4]    : psadframe
//   [sp, #8]    : psad8x8
//   [sp, #12]   : psum16x16
//   [sp, #16]   : psqsum16x16
//   [sp, #20]   : psqdiff16x16
//   [sp, #24]   : p_sd8x8
//   [sp, #28]   : p_mad8x8
// Both loops end only when their counter reaches exactly zero, so r2 and r3
// have to be non-zero multiples of 16.
// r0-r12, r14 and q4-q7 are saved and restored; q0-q3 and q8-q15 are clobbered.
WELS_ASM_FUNC_BEGIN VAACalcSadSsdBgd_neon
    stmdb sp!, {r0-r12, r14}        //14 registers = 56 bytes
    vpush {q4-q7}                   //64 bytes, stack arguments now start at sp+120

    ldr r4, [sp, #120] //r4 keeps the pic_stride

    sub r5, r4, #1
    lsl r5, r5, #4 //r5 keeps the little step: 16*pic_stride-16

    lsl r6, r4, #4
    sub r6, r2, r6  //r6 keeps the big step: pic_width-16*pic_stride


    ldr r8, [sp, #128]//psad8x8
    ldr r9, [sp, #132]//psum16x16
    ldr r10, [sp, #136]//psqsum16x16
    ldr r11, [sp, #140]//psqdiff16x16
    ldr r12, [sp, #144]//p_sd8x8
    ldr r14, [sp, #148]//p_mad8x8

    vmov.i8 q12, #0                 //q12 accumulates psadframe

vaa_calc_sad_ssd_bgd_height_loop:

    mov r7, r2                      //r7 counts the remaining width
vaa_calc_sad_ssd_bgd_width_loop:

    //l_sd q14&q15, l_mad q13, l_sad q6 & q7, l_sqdiff  q8, l_sum q9, l_sqsum q10
    SAD_SSD_BGD_16x16 r0,r1,r4

    //psad8x8
    vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!

    sub r0, r0, r5 //jump to next 16x16
    sub r1, r1, r5 //jump to next 16x16

    //p_sd8x8
    vst4.32 {d28[0], d29[0],d30[0], d31[0]}, [r12]!

    //p_mad8x8
    vst2.16 {d26[0], d27[0]}, [r14]!

    //psqdiff16x16
    vpaddl.s32 q8, q8
    vadd.i32 d16, d16, d17

    vst1.32 {d16[0]}, [r11]! //psqdiff16x16

    //psum16x16
    SSD_SAD_SD_MAD_PADDL q9, d18, d19
    vst1.32 {d18[0]}, [r9]! //psum16x16

    //psqsum16x16
    vpaddl.s32 q10, q10
    vadd.i32 d20, d20, d21
    vst1.32 {d20[0]}, [r10]! //psqsum16x16

    subs r7, #16

    bne vaa_calc_sad_ssd_bgd_width_loop

    sub r0, r0, r6      //jump to next 16 x width
    sub r1, r1, r6      //jump to next 16 x width

    subs r3, #16
    bne vaa_calc_sad_ssd_bgd_height_loop

    //psadframe
    ldr r7, [sp, #124]//psadframe

    vadd.i32 d24, d24, d25
    vst1.32 {d24[0]}, [r7]

    vpop {q4-q7}
    ldmia sp!, {r0-r12, r14}

WELS_ASM_FUNC_END


// One 16-byte row of the SAD/variance computation (neither first nor last row).
// Expects q1 to hold the ref row that belongs to the cur row loaded here;
// loads the ref row of the following row into q1.
//   arg0 : address register of the cur rows, post-incremented by arg2
//   arg1 : address register of the ref rows, post-incremented by arg2
//   arg2 : register holding the row step
//   arg3 : q accumulator (8 x u16) for l_sad
// Updates q3, q4, q9, q10; clobbers q0, q1, q11; leaves the abs_diff in q2.
.macro SAD_VAR_16 arg0, arg1, arg2, arg3
    vld1.8 {q0}, [\arg0], \arg2 //load cur_row

    vpadal.u8 q3, q0    //add cur_row together
    vpadal.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vpadal.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16

    vld1.8 {q1}, [\arg1], \arg2                 //load ref_row of the next row

    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
.endm

// Last row of a 16x16 block: same work as SAD_VAR_16, but no further ref row
// is loaded. Note that the argument positions differ from SAD_VAR_16:
//   arg0 : address register of the cur rows, post-incremented by arg1
//   arg1 : register holding the row step
//   arg2 : q accumulator (8 x u16) for l_sad
// Expects q1 to hold the matching ref row.
// Updates q3, q4, q9, q10; clobbers q0, q11; leaves the abs_diff in q2.
.macro SAD_VAR_16_END arg0, arg1, arg2
    vld1.8 {q0}, [\arg0], \arg1 //load cur_row

    vpadal.u8 q3, q0    //add cur_row together
    vpadal.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vpadal.u8 \arg2, q2                         //l_sad for 16 bytes reset for every 8x16

    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
.endm


// First row of a 16x16 block. Every accumulator (q3, q4, the l_sad register,
// q9, q10) is overwritten instead of accumulated. This macro loads its own
// ref row and afterwards the ref row of the next row, so the ref address
// advances by two rows.
//   arg0 : address register of the cur rows, post-incremented by arg2
//   arg1 : address register of the ref rows, post-incremented by arg2 (twice)
//   arg2 : register holding the row step
//   arg3 : q register (8 x u16) that receives the new l_sad start value
// Clobbers q0, q1, q11; leaves the abs_diff in q2.
.macro SAD_VAR_16_RESET_16x16 arg0, arg1, arg2, arg3
    vld1.8 {q0}, [\arg0], \arg2 //load cur_row
    vld1.8 {q1}, [\arg1], \arg2 //load ref_row

    vpaddl.u8 q3, q0    //add cur_row together
    vpaddl.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16

    vld1.8 {q1}, [\arg1], \arg2                 //load ref_row of the next row

    vpaddl.u8 q9, q0                            //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES_RESET d0,d1, q10, q11   //q10 for lsqsum    reset for every 16x16
.endm

// First row of the lower 8 rows of a 16x16 block. The per-8-row values
// (q3, q4 and the l_sad register) are overwritten; the per-16x16
// accumulators q9 and q10 keep accumulating.
// Expects q1 to hold the matching ref row; loads the ref row of the next row.
//   arg0 : address register of the cur rows, post-incremented by arg2
//   arg1 : address register of the ref rows, post-incremented by arg2
//   arg2 : register holding the row step
//   arg3 : q register (8 x u16) that receives the new l_sad start value
// Clobbers q0, q1, q11; leaves the abs_diff in q2.
.macro SAD_VAR_16_RESET_8x8 arg0, arg1, arg2, arg3
    vld1.8 {q0}, [\arg0], \arg2 //load cur_row

    vpaddl.u8 q3, q0    //add cur_row together
    vpaddl.u8 q4, q1    //add ref_row together

    vabd.u8 q2, q0, q1  //abs_diff

    vpaddl.u8 \arg3, q2                         //l_sad for 16 bytes reset for every 8x16

    vld1.8 {q1}, [\arg1], \arg2                 //load ref_row of the next row

    vpadal.u8 q9, q0                            //q9 for l_sum      reset for every 16x16

    SSD_MUL_SUM_16BYTES d0,d1, q10, q11         //q10 for lsqsum    reset for every 16x16
.endm

// Process one 16x16 block as two groups of 8 rows.
//   arg0 : address register of the cur picture, advanced by 16 rows
//   arg1 : address register of the ref picture, advanced by 16 rows
//   arg2 : register holding the row step
// Results: l_sad in q6/q7 (one 64-bit sum per d half), l_sum in q9,
//          l_sqsum in q10; q12 accumulates psadframe.
// Clobbers q0-q4 and q11 in addition.
.macro SAD_VAR_16x16 arg0, arg1, arg2
    //for one 8x16
    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6
    SAD_VAR_16 \arg0, \arg1, \arg2, q6

    vpaddl.u16 q6, q6
    vpaddl.u32 q6, q6
    vadd.i32 q12, q6

    //for another 8x16
    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16 \arg0, \arg1, \arg2, q7
    SAD_VAR_16_END \arg0, \arg2, q7             //last row: only the step is passed, no ref address

    vpaddl.u16 q7, q7
    vpaddl.u32 q7, q7

    vadd.i32 q12, q7
.endm


// VAACalcSadVar_neon
// Walks the cur and ref pictures in 16x16 blocks. Per block it stores:
//   psad8x8     : 4 x 32 bit, SAD of each 8x8 sub-block
//   psum16x16   : 32 bit, sum of the cur pixels
//   psqsum16x16 : 32 bit, sum of the squared cur pixels
// The sum of all SADs is stored to psadframe at the end.
//
// In:
//   r0          : address of the cur picture
//   r1          : address of the ref picture
//   r2          : pic_width
//   r3          : row counter, reduced by 16 per block row (presumably pic_height)
//   [sp, #0]    : pic_stride   (offsets as seen on entry)
//   [sp, #4]    : psadframe
//   [sp, #8]    : psad8x8
//   [sp, #12]   : psum16x16
//   [sp, #16]   : psqsum16x16
// Both loops end only when their counter reaches exactly zero, so r2 and r3
// have to be non-zero multiples of 16.
// r4-r11, q4, q6 and q7 are saved and restored; r0-r3, q0-q3 and q9-q12 are clobbered.
WELS_ASM_FUNC_BEGIN VAACalcSadVar_neon
    stmdb sp!, {r4-r11}             //32 bytes
    vpush {q4}                      //16 bytes
    vpush {q6-q7}                   //32 bytes, stack arguments now start at sp+80

    ldr r4, [sp, #80] //r4 keeps the pic_stride

    sub r5, r4, #1
    lsl r5, r5, #4 //r5 keeps the little step: 16*pic_stride-16

    lsl r6, r4, #4
    sub r6, r2, r6  //r6 keeps the big step: pic_width-16*pic_stride

    ldr r7,     [sp, #84]   //psadframe
    ldr r8,     [sp, #88]   //psad8x8
    ldr r9,     [sp, #92]   //psum16x16
    ldr r10,    [sp, #96]   //psqsum16x16

    vmov.i8 q12, #0                 //q12 accumulates psadframe
vaa_calc_sad_var_height_loop:

    mov r11, r2                     //r11 counts the remaining width
vaa_calc_sad_var_width_loop:


    SAD_VAR_16x16 r0,r1,r4
    //psad8x8
    vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!

    sub r0, r0, r5 //jump to next 16x16
    sub r1, r1, r5 //jump to next 16x16

    //psum16x16
    SSD_SAD_SD_MAD_PADDL q9, d18, d19
    vst1.32 {d18[0]}, [r9]! //psum16x16

    //psqsum16x16
    vpaddl.s32 q10, q10
    subs r11, #16                   //NEON instructions below leave the flags untouched
    vadd.i32 d20, d20, d21
    vst1.32 {d20[0]}, [r10]! //psqsum16x16

    bne vaa_calc_sad_var_width_loop

    sub r0, r0, r6      //jump to next 16 x width
    sub r1, r1, r6      //jump to next 16 x width

    subs r3, #16
    bne vaa_calc_sad_var_height_loop

    vadd.i32 d24, d24, d25
    vst1.32 {d24[0]}, [r7]

    vpop {q6-q7}
    vpop {q4}
    ldmia sp!, {r4-r11}
WELS_ASM_FUNC_END


// SAD_VAR_16 plus accumulation of the squared differences.
// Arguments are the same as for SAD_VAR_16 (cur address, ref address, row
// step, l_sad accumulator). The abs_diff left in q2 by SAD_VAR_16 is squared
// and added to q8; q11 is scratch.
.macro SAD_SSD_16 arg0, arg1, arg2, arg3
    SAD_VAR_16 \arg0, \arg1, \arg2, \arg3

    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
.endm

// SAD_VAR_16_END plus accumulation of the squared differences.
// Arguments are the same as for SAD_VAR_16_END (cur address, row step,
// l_sad accumulator). The abs_diff left in q2 is squared and added to q8;
// q11 is scratch.
.macro SAD_SSD_16_END arg0, arg1, arg2
    SAD_VAR_16_END \arg0, \arg1, \arg2

    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
.endm

// SAD_VAR_16_RESET_16x16 plus a fresh start of the squared-difference sum.
// Arguments are the same as for SAD_VAR_16_RESET_16x16 (cur address, ref
// address, row step, l_sad register). q8 is overwritten with the squared
// abs_diff of this row; q11 is scratch.
.macro SAD_SSD_16_RESET_16x16 arg0, arg1, arg2, arg3
    SAD_VAR_16_RESET_16x16 \arg0, \arg1, \arg2, \arg3

    SSD_MUL_SUM_16BYTES_RESET d4,d5,q8, q11         //q8 for l_sqiff    reset for every 16x16
.endm

// SAD_VAR_16_RESET_8x8 plus accumulation of the squared differences.
// Arguments are the same as for SAD_VAR_16_RESET_8x8 (cur address, ref
// address, row step, l_sad register). q8 keeps accumulating across the whole
// 16x16 block; q11 is scratch.
.macro SAD_SSD_16_RESET_8x8 arg0, arg1, arg2, arg3
    SAD_VAR_16_RESET_8x8 \arg0, \arg1, \arg2, \arg3

    SSD_MUL_SUM_16BYTES d4,d5,q8, q11           //q8 for l_sqiff    reset for every 16x16
.endm

// Process one 16x16 block as two groups of 8 rows.
//   arg0 : address register of the cur picture, advanced by 16 rows
//   arg1 : address register of the ref picture, advanced by 16 rows
//   arg2 : register holding the row step
// Results: l_sad in q6/q7 (one 64-bit sum per d half), l_sqdiff in q8,
//          l_sum in q9, l_sqsum in q10; q12 accumulates psadframe.
// Clobbers q0-q4 and q11 in addition.
.macro SAD_SSD_16x16 arg0, arg1, arg2
    //for one 8x16
    SAD_SSD_16_RESET_16x16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6
    SAD_SSD_16 \arg0, \arg1, \arg2, q6

    vpaddl.u16 q6, q6
    vpaddl.u32 q6, q6
    vadd.i32 q12, q6

    //for another 8x16
    SAD_SSD_16_RESET_8x8 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16 \arg0, \arg1, \arg2, q7
    SAD_SSD_16_END \arg0, \arg2, q7             //last row: only the step is passed, no ref address

    vpaddl.u16 q7, q7
    vpaddl.u32 q7, q7

    vadd.i32 q12, q7
.endm


// VAACalcSadSsd_neon
// Walks the cur and ref pictures in 16x16 blocks. Per block it stores:
//   psad8x8      : 4 x 32 bit, SAD of each 8x8 sub-block
//   psum16x16    : 32 bit, sum of the cur pixels
//   psqsum16x16  : 32 bit, sum of the squared cur pixels
//   psqdiff16x16 : 32 bit, sum of the squared differences
// The sum of all SADs is stored to psadframe at the end.
//
// In:
//   r0          : address of the cur picture
//   r1          : address of the ref picture
//   r2          : pic_width
//   r3          : row counter, reduced by 16 per block row (presumably pic_height)
//   [sp, #0]    : pic_stride   (offsets as seen on entry)
//   [sp, #4]    : psadframe
//   [sp, #8]    : psad8x8
//   [sp, #12]   : psum16x16
//   [sp, #16]   : psqsum16x16
//   [sp, #20]   : psqdiff16x16
// Both loops end only when their counter reaches exactly zero, so r2 and r3
// have to be non-zero multiples of 16.
// r4-r12, q4, q6 and q7 are saved and restored; r0-r3, q0-q3 and q8-q12 are clobbered.
WELS_ASM_FUNC_BEGIN VAACalcSadSsd_neon
    stmdb sp!, {r4-r12}             //9 registers = 36 bytes
    vpush {q4}                      //16 bytes
    vpush {q6-q7}                   //32 bytes, stack arguments now start at sp+84

    ldr r4, [sp, #84] //r4 keeps the pic_stride

    sub r5, r4, #1
    lsl r5, r5, #4 //r5 keeps the little step: 16*pic_stride-16

    lsl r6, r4, #4
    sub r6, r2, r6  //r6 keeps the big step: pic_width-16*pic_stride

    ldr r7,     [sp, #88]   //psadframe
    ldr r8,     [sp, #92]   //psad8x8
    ldr r9,     [sp, #96]   //psum16x16
    ldr r10,    [sp, #100]  //psqsum16x16
    ldr r11,    [sp, #104]  //psqdiff16x16

    vmov.i8 q12, #0                 //q12 accumulates psadframe
vaa_calc_sad_ssd_height_loop:

    mov r12, r2                     //r12 counts the remaining width
vaa_calc_sad_ssd_width_loop:


    SAD_SSD_16x16 r0,r1,r4
    //psad8x8
    vst4.32 {d12[0], d13[0], d14[0], d15[0]}, [r8]!

    sub r0, r0, r5 //jump to next 16x16
    sub r1, r1, r5 //jump to next 16x16

    //psum16x16
    vpaddl.s16 q9, q9
    vpaddl.s32 q9, q9
    vadd.i32 d18, d18, d19
    vst1.32 {d18[0]}, [r9]! //psum16x16

    //psqsum16x16
    vpaddl.s32 q10, q10
    vadd.i32 d20, d20, d21
    vst1.32 {d20[0]}, [r10]! //psqsum16x16

    //psqdiff16x16
    vpaddl.s32 q8, q8
    vadd.i32 d16, d16, d17
    subs r12, #16                   //NEON instructions below leave the flags untouched
    vst1.32 {d16[0]}, [r11]! //psqdiff16x16

    bne vaa_calc_sad_ssd_width_loop

    sub r0, r0, r6      //jump to next 16 x width
    sub r1, r1, r6      //jump to next 16 x width

    subs r3, #16
    bne vaa_calc_sad_ssd_height_loop

    vadd.i32 d24, d24, d25
    vst1.32 {d24[0]}, [r7]

    vpop {q6-q7}
    vpop {q4}
    ldmia sp!, {r4-r12}
WELS_ASM_FUNC_END

#endif  // HAVE_NEON
