/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "oapv_def.h"
#include <math.h>

#if ARM_NEON

/* SAD for 16bit **************************************************************/
int sad_16b_neon_8x2n(int w, int h, void *src1, void *src2, int s_src1, int s_src2)
{
    int sad = 0;
    s16* s1 = (s16*) src1;
    s16* s2 = (s16*) src2;
    int16x8_t s1_vector, s2_vector;
    int32x4_t diff_part1, diff_part2, diff_part1_abs, diff_part2_abs, sad_vector, sad_vector_temp;
    // Loop unrolled
    { // Row 0
        // Load one row (8 elements) each of src1 and src2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        // Subtract s2_vector from s1_vector, widening the differences to 32 bits
        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        // Take the absolute value of each difference, then add the two halves
        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector = vaddq_s32(diff_part1_abs, diff_part2_abs);
    }
    { // Row 1
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        // Update sad_vector by adding the new values
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 3
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 4
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 5
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 6
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 7
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    // Add up all the elements in sad_vector
    sad = vaddvq_s32(sad_vector);
    return sad;
}

const oapv_fn_sad_t oapv_tbl_fn_sad_16b_neon[2] = {
    sad_16b_neon_8x2n,
    NULL
};
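
/* For reference, a scalar sketch of what sad_16b_neon_8x2n computes, kept under
 * #if 0 so it is not built. It assumes only the s16 sample type and the
 * element-unit stride convention used throughout this file; the helper name is
 * illustrative and not part of the library (the NEON kernel itself is
 * specialized to 8-wide blocks and does not consult w). */
#if 0
static int sad_16b_scalar(int w, int h, void *src1, void *src2, int s_src1, int s_src2)
{
    s16 *s1 = (s16 *)src1;
    s16 *s2 = (s16 *)src2;
    int  sad = 0;
    for(int i = 0; i < h; i++) {
        for(int j = 0; j < w; j++) {
            int d = (int)s1[j] - (int)s2[j];
            sad += (d >= 0) ? d : -d;
        }
        s1 += s_src1;
        s2 += s_src2;
    }
    return sad;
}
#endif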

/* SSD ***********************************************************************/
static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2)
{
    s64 ssd = 0;
    s16* s1 = (s16*) src1;
    s16* s2 = (s16*) src2;
    int16x8_t s1_vector, s2_vector;
    int32x4_t diff1, diff2;
    int32x2_t diff1_low, diff2_low;
    int64x2_t sq_diff1_low, sq_diff1_high, sq_diff2_low, sq_diff2_high, sq_diff;
    // Loop unrolled
    { // Row 0
        // Load one row (8 elements) each of src1 and src2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        // Widening subtract: 16-bit differences stored as 32 bits
        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        // Square each difference into a 64-bit lane
        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff1_low, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 1
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 3
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 4
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 5
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 6
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 7
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    // Reduce the two 64-bit lanes to a scalar
    ssd += vaddvq_s64(sq_diff);
    return ssd;
}

const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_neon[2] = {
    ssd_16b_neon_8x8,
    NULL
};
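
/* A scalar sketch of the sum of squared differences computed above (not built).
 * The return type is s64 because a single squared difference of two 16-bit
 * samples can approach 2^32, so a 32-bit accumulator could overflow within a
 * block; the helper name is illustrative only. */
#if 0
static s64 ssd_16b_scalar(int w, int h, void *src1, void *src2, int s_src1, int s_src2)
{
    s16 *s1 = (s16 *)src1;
    s16 *s2 = (s16 *)src2;
    s64  ssd = 0;
    for(int i = 0; i < h; i++) {
        for(int j = 0; j < w; j++) {
            s64 d = (s64)s1[j] - (s64)s2[j];
            ssd += d * d;
        }
        s1 += s_src1;
        s2 += s_src2;
    }
    return ssd;
}
#endif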

/* DIFF **********************************************************************/
static void diff_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff)
{
    s16* s1 = (s16*) src1;
    s16* s2 = (s16*) src2;
    int16x8_t s1_vector, s2_vector, diff_vector;
    // Loop unrolled
    { // Row 0
        // Load one row (8 elements) each of src1 and src2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        // Subtract s2_vector from s1_vector
        diff_vector = vsubq_s16(s1_vector, s2_vector);

        // Store the result in diff
        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 1
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 3
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 4
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 5
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 6
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 7
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
}

const oapv_fn_diff_t oapv_tbl_fn_diff_16b_neon[2] = {
    diff_16b_neon_8x8,
    NULL
};
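
/* Note: like the SAD and SSD kernels above, diff_16b_neon_8x8 is specialized
 * for 8x8 blocks, so the w and h arguments are unused. It writes the residual
 * s1 - s2 row by row, with independent element strides for the two sources
 * (s_src1, s_src2) and the output (s_diff). */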

/* SATD **********************************************************************/
int oapv_dc_removed_had8x8_neon(pel* org, int s_org)
{
    int satd = 0;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n-bit elements packed in the register                    */

    int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
    int16x8_t pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
    int16x8_t pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
    int16x8_t out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
    int16x8_t out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;

    /* load the eight source rows */
    src0_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src1_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src2_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src3_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src4_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src5_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src6_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src7_8x16b = vld1q_s16(&org[0]);

    /**************** 8x8 horizontal transform *******************************/
    /***********************    8x8 16 bit Transpose  ************************/

    out3_8x16b = vcombine_s16(vget_low_s16(src0_8x16b), vget_low_s16(src1_8x16b));
    out7_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vget_high_s16(src1_8x16b));

    pred0_8x16b = vcombine_s16(vget_low_s16(src2_8x16b), vget_low_s16(src3_8x16b));
    src2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vget_high_s16(src3_8x16b));

    out2_8x16b = vcombine_s16(vget_low_s16(src4_8x16b), vget_low_s16(src5_8x16b));
    pred7_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vget_high_s16(src5_8x16b));

    pred3_8x16b = vcombine_s16(vget_low_s16(src6_8x16b), vget_low_s16(src7_8x16b));
    src6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vget_high_s16(src7_8x16b));

    /* 32-bit zips; explicit vreinterpretq casts keep the NEON element types consistent */
    out1_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(out3_8x16b), vreinterpretq_s32_s16(pred0_8x16b)));
    out3_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(out3_8x16b), vreinterpretq_s32_s16(pred0_8x16b)));

    pred1_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(out2_8x16b), vreinterpretq_s32_s16(pred3_8x16b)));
    pred3_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(out2_8x16b), vreinterpretq_s32_s16(pred3_8x16b)));

    out5_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(out7_8x16b), vreinterpretq_s32_s16(src2_8x16b)));
    out7_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(out7_8x16b), vreinterpretq_s32_s16(src2_8x16b)));

    pred5_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(pred7_8x16b), vreinterpretq_s32_s16(src6_8x16b)));
    pred7_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(pred7_8x16b), vreinterpretq_s32_s16(src6_8x16b)));

    /* 64-bit zips */
    out0_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out1_8x16b), vreinterpretq_s64_s16(pred1_8x16b)));
    out1_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out1_8x16b), vreinterpretq_s64_s16(pred1_8x16b)));
    out2_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out3_8x16b), vreinterpretq_s64_s16(pred3_8x16b)));
    out3_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out3_8x16b), vreinterpretq_s64_s16(pred3_8x16b)));
    out4_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out5_8x16b), vreinterpretq_s64_s16(pred5_8x16b)));
    out5_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out5_8x16b), vreinterpretq_s64_s16(pred5_8x16b)));
    out6_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out7_8x16b), vreinterpretq_s64_s16(pred7_8x16b)));
    out7_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out7_8x16b), vreinterpretq_s64_s16(pred7_8x16b)));

    /**********************   8x8 16 bit Transpose End   *********************/

    /* r0 + r1 */
    pred0_8x16b = vaddq_s16(out0_8x16b, out1_8x16b);
    /* r2 + r3 */
    pred2_8x16b = vaddq_s16(out2_8x16b, out3_8x16b);
    /* r4 + r5 */
    pred4_8x16b = vaddq_s16(out4_8x16b, out5_8x16b);
    /* r6 + r7 */
    pred6_8x16b = vaddq_s16(out6_8x16b, out7_8x16b);

    /* r0 + r1 + r2 + r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 + r6 + r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
    src0_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
    src4_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 + r1 - r2 - r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 - r6 - r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
    src2_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
    src6_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 */
    pred0_8x16b = vsubq_s16(out0_8x16b, out1_8x16b);
    /* r2 - r3 */
    pred2_8x16b = vsubq_s16(out2_8x16b, out3_8x16b);
    /* r4 - r5 */
    pred4_8x16b = vsubq_s16(out4_8x16b, out5_8x16b);
    /* r6 - r7 */
    pred6_8x16b = vsubq_s16(out6_8x16b, out7_8x16b);

    /* r0 - r1 + r2 - r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 + r6 - r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
    src1_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
    src5_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 - r2 + r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 - r6 + r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
    src3_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
    src7_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /***********************    8x8 16 bit Transpose  ************************/
    out3_8x16b = vzip1q_s16(src0_8x16b, src1_8x16b);
    pred0_8x16b = vzip1q_s16(src2_8x16b, src3_8x16b);
    out2_8x16b = vzip1q_s16(src4_8x16b, src5_8x16b);
    pred3_8x16b = vzip1q_s16(src6_8x16b, src7_8x16b);
    out7_8x16b = vzip2q_s16(src0_8x16b, src1_8x16b);
    src2_8x16b = vzip2q_s16(src2_8x16b, src3_8x16b);
    pred7_8x16b = vzip2q_s16(src4_8x16b, src5_8x16b);
    src6_8x16b = vzip2q_s16(src6_8x16b, src7_8x16b);

    /* 32-bit zips; explicit vreinterpretq casts keep the NEON element types consistent */
    out1_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(out3_8x16b), vreinterpretq_s32_s16(pred0_8x16b)));
    out3_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(out3_8x16b), vreinterpretq_s32_s16(pred0_8x16b)));

    pred1_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(out2_8x16b), vreinterpretq_s32_s16(pred3_8x16b)));
    pred3_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(out2_8x16b), vreinterpretq_s32_s16(pred3_8x16b)));

    out5_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(out7_8x16b), vreinterpretq_s32_s16(src2_8x16b)));
    out7_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(out7_8x16b), vreinterpretq_s32_s16(src2_8x16b)));

    pred5_8x16b = vreinterpretq_s16_s32(vzip1q_s32(vreinterpretq_s32_s16(pred7_8x16b), vreinterpretq_s32_s16(src6_8x16b)));
    pred7_8x16b = vreinterpretq_s16_s32(vzip2q_s32(vreinterpretq_s32_s16(pred7_8x16b), vreinterpretq_s32_s16(src6_8x16b)));

    /* 64-bit zips */
    src0_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out1_8x16b), vreinterpretq_s64_s16(pred1_8x16b)));
    src1_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out1_8x16b), vreinterpretq_s64_s16(pred1_8x16b)));
    src2_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out3_8x16b), vreinterpretq_s64_s16(pred3_8x16b)));
    src3_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out3_8x16b), vreinterpretq_s64_s16(pred3_8x16b)));
    src4_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out5_8x16b), vreinterpretq_s64_s16(pred5_8x16b)));
    src5_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out5_8x16b), vreinterpretq_s64_s16(pred5_8x16b)));
    src6_8x16b = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s16(out7_8x16b), vreinterpretq_s64_s16(pred7_8x16b)));
    src7_8x16b = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s16(out7_8x16b), vreinterpretq_s64_s16(pred7_8x16b)));

    /**********************   8x8 16 bit Transpose End   *********************/
    /**************** 8x8 horizontal transform *******************************/
    {
        int32x4_t src0_4x32b, src1_4x32b, src2_4x32b, src3_4x32b;
        int32x4_t src4_4x32b, src5_4x32b, src6_4x32b, src7_4x32b;
        int32x4_t pred0_4x32b, pred1_4x32b, pred2_4x32b;
        int32x4_t pred4_4x32b, pred5_4x32b, pred6_4x32b;
        int32x4_t out0_4x32b, out1_4x32b, out2_4x32b, out3_4x32b;
        int32x4_t out4_4x32b, out5_4x32b, out6_4x32b, out7_4x32b;
        int32x4_t out0a_4x32b, out1a_4x32b, out2a_4x32b, out3a_4x32b;
        int32x4_t out4a_4x32b, out5a_4x32b, out6a_4x32b, out7a_4x32b;
        int16x8_t tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
        int16x8_t tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;

        /************************* 8x8 Vertical Transform*************************/
        /* save the high half of each transposed row for the second pass */
        tmp0_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vcreate_s16(0));
        tmp1_8x16b = vcombine_s16(vget_high_s16(src1_8x16b), vcreate_s16(0));
        tmp2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vcreate_s16(0));
        tmp3_8x16b = vcombine_s16(vget_high_s16(src3_8x16b), vcreate_s16(0));
        tmp4_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vcreate_s16(0));
        tmp5_8x16b = vcombine_s16(vget_high_s16(src5_8x16b), vcreate_s16(0));
        tmp6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vcreate_s16(0));
        tmp7_8x16b = vcombine_s16(vget_high_s16(src7_8x16b), vcreate_s16(0));

        /*************************First 4 pixels ********************************/

        /* sign-extend the low four 16-bit coefficients of each row to 32 bits */
        src0_4x32b = vmovl_s16(vget_low_s16(src0_8x16b));
        src1_4x32b = vmovl_s16(vget_low_s16(src1_8x16b));
        src2_4x32b = vmovl_s16(vget_low_s16(src2_8x16b));
        src3_4x32b = vmovl_s16(vget_low_s16(src3_8x16b));
        src4_4x32b = vmovl_s16(vget_low_s16(src4_8x16b));
        src5_4x32b = vmovl_s16(vget_low_s16(src5_8x16b));
        src6_4x32b = vmovl_s16(vget_low_s16(src6_8x16b));
        src7_4x32b = vmovl_s16(vget_low_s16(src7_8x16b));

        /* r0 + r1 */
        pred0_4x32b = vaddq_s32(src0_4x32b, src1_4x32b);
        /* r2 + r3 */
        pred2_4x32b = vaddq_s32(src2_4x32b, src3_4x32b);
        /* r4 + r5 */
        pred4_4x32b = vaddq_s32(src4_4x32b, src5_4x32b);
        /* r6 + r7 */
        pred6_4x32b = vaddq_s32(src6_4x32b, src7_4x32b);

        /* r0 + r1 + r2 + r3 */
        pred1_4x32b = vaddq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 + r5 + r6 + r7 */
        pred5_4x32b = vaddq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /* r0 + r1 - r2 - r3 */
        pred1_4x32b = vsubq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 + r5 - r6 - r7 */
        pred5_4x32b = vsubq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /* r0 - r1 */
        pred0_4x32b = vsubq_s32(src0_4x32b, src1_4x32b);
        /* r2 - r3 */
        pred2_4x32b = vsubq_s32(src2_4x32b, src3_4x32b);
        /* r4 - r5 */
        pred4_4x32b = vsubq_s32(src4_4x32b, src5_4x32b);
        /* r6 - r7 */
        pred6_4x32b = vsubq_s32(src6_4x32b, src7_4x32b);

        /* r0 - r1 + r2 - r3 */
        pred1_4x32b = vaddq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 - r5 + r6 - r7 */
        pred5_4x32b = vaddq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /* r0 - r1 - r2 + r3 */
        pred1_4x32b = vsubq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 - r5 - r6 + r7 */
        pred5_4x32b = vsubq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /*************************First 4 pixels ********************************/

        /**************************Next 4 pixels *******************************/
        /* sign-extend the high four coefficients (saved in the tmp registers) */
        src0_4x32b = vmovl_s16(vget_low_s16(tmp0_8x16b));
        src1_4x32b = vmovl_s16(vget_low_s16(tmp1_8x16b));
        src2_4x32b = vmovl_s16(vget_low_s16(tmp2_8x16b));
        src3_4x32b = vmovl_s16(vget_low_s16(tmp3_8x16b));
        src4_4x32b = vmovl_s16(vget_low_s16(tmp4_8x16b));
        src5_4x32b = vmovl_s16(vget_low_s16(tmp5_8x16b));
        src6_4x32b = vmovl_s16(vget_low_s16(tmp6_8x16b));
        src7_4x32b = vmovl_s16(vget_low_s16(tmp7_8x16b));

        /* r0 + r1 */
        pred0_4x32b = vaddq_s32(src0_4x32b, src1_4x32b);
        /* r2 + r3 */
        pred2_4x32b = vaddq_s32(src2_4x32b, src3_4x32b);
        /* r4 + r5 */
        pred4_4x32b = vaddq_s32(src4_4x32b, src5_4x32b);
        /* r6 + r7 */
        pred6_4x32b = vaddq_s32(src6_4x32b, src7_4x32b);

        /* r0 + r1 + r2 + r3 */
        pred1_4x32b = vaddq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 + r5 + r6 + r7 */
        pred5_4x32b = vaddq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0a_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4a_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /* r0 + r1 - r2 - r3 */
        pred1_4x32b = vsubq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 + r5 - r6 - r7 */
        pred5_4x32b = vsubq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2a_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6a_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /* r0 - r1 */
        pred0_4x32b = vsubq_s32(src0_4x32b, src1_4x32b);
        /* r2 - r3 */
        pred2_4x32b = vsubq_s32(src2_4x32b, src3_4x32b);
        /* r4 - r5 */
        pred4_4x32b = vsubq_s32(src4_4x32b, src5_4x32b);
        /* r6 - r7 */
        pred6_4x32b = vsubq_s32(src6_4x32b, src7_4x32b);

        /* r0 - r1 + r2 - r3 */
        pred1_4x32b = vaddq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 - r5 + r6 - r7 */
        pred5_4x32b = vaddq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1a_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5a_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /* r0 - r1 - r2 + r3 */
        pred1_4x32b = vsubq_s32(pred0_4x32b, pred2_4x32b);
        /* r4 - r5 - r6 + r7 */
        pred5_4x32b = vsubq_s32(pred4_4x32b, pred6_4x32b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3a_4x32b = vaddq_s32(pred1_4x32b, pred5_4x32b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7a_4x32b = vsubq_s32(pred1_4x32b, pred5_4x32b);

        /**************************Next 4 pixels *******************************/
        /************************* 8x8 Vertical Transform*************************/

        /****************************SATD calculation ****************************/
        src0_4x32b = vabsq_s32(out0_4x32b);
        src1_4x32b = vabsq_s32(out1_4x32b);
        src2_4x32b = vabsq_s32(out2_4x32b);
        src3_4x32b = vabsq_s32(out3_4x32b);
        src4_4x32b = vabsq_s32(out4_4x32b);
        src5_4x32b = vabsq_s32(out5_4x32b);
        src6_4x32b = vabsq_s32(out6_4x32b);
        src7_4x32b = vabsq_s32(out7_4x32b);
        /* remove the DC coefficient: zero lane 0 of the first output register */
        src0_4x32b = vsetq_lane_s32(0, src0_4x32b, 0);

        satd = vaddvq_s32(src0_4x32b);
        satd += vaddvq_s32(src1_4x32b);
        satd += vaddvq_s32(src2_4x32b);
        satd += vaddvq_s32(src3_4x32b);
        satd += vaddvq_s32(src4_4x32b);
        satd += vaddvq_s32(src5_4x32b);
        satd += vaddvq_s32(src6_4x32b);
        satd += vaddvq_s32(src7_4x32b);

        src0_4x32b = vabsq_s32(out0a_4x32b);
        src1_4x32b = vabsq_s32(out1a_4x32b);
        src2_4x32b = vabsq_s32(out2a_4x32b);
        src3_4x32b = vabsq_s32(out3a_4x32b);
        src4_4x32b = vabsq_s32(out4a_4x32b);
        src5_4x32b = vabsq_s32(out5a_4x32b);
        src6_4x32b = vabsq_s32(out6a_4x32b);
        src7_4x32b = vabsq_s32(out7a_4x32b);

        satd += vaddvq_s32(src0_4x32b);
        satd += vaddvq_s32(src1_4x32b);
        satd += vaddvq_s32(src2_4x32b);
        satd += vaddvq_s32(src3_4x32b);
        satd += vaddvq_s32(src4_4x32b);
        satd += vaddvq_s32(src5_4x32b);
        satd += vaddvq_s32(src6_4x32b);
        satd += vaddvq_s32(src7_4x32b);

        /* divide by 4 with rounding (Hadamard normalization) */
        satd = (satd + 2) >> 2;
        return satd;
    }
}
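
/* A scalar sketch of the DC-removed 8x8 Hadamard SATD computed above (not
 * built). Any consistent butterfly ordering yields the same sum of absolute
 * transform coefficients; the all-plus (DC) coefficient lands at index [0][0]
 * and is dropped, matching the lane the NEON path zeroes. The helper name is
 * illustrative only. */
#if 0
static int dc_removed_had8x8_scalar(pel *org, int s_org)
{
    int m1[8][8], m2[8][8];
    int a[8], b[8];
    int i, j, satd = 0;

    /* horizontal 8-point Hadamard butterfly on each row */
    for(i = 0; i < 8; i++) {
        for(j = 0; j < 8; j++) a[j] = org[i * s_org + j];
        for(j = 0; j < 4; j++) {
            b[j] = a[j] + a[j + 4];
            b[j + 4] = a[j] - a[j + 4];
        }
        for(j = 0; j < 2; j++) {
            a[j] = b[j] + b[j + 2];
            a[j + 2] = b[j] - b[j + 2];
            a[j + 4] = b[j + 4] + b[j + 6];
            a[j + 6] = b[j + 4] - b[j + 6];
        }
        for(j = 0; j < 4; j++) {
            m1[i][2 * j] = a[2 * j] + a[2 * j + 1];
            m1[i][2 * j + 1] = a[2 * j] - a[2 * j + 1];
        }
    }
    /* vertical pass: the same butterfly applied to each column */
    for(j = 0; j < 8; j++) {
        for(i = 0; i < 8; i++) a[i] = m1[i][j];
        for(i = 0; i < 4; i++) {
            b[i] = a[i] + a[i + 4];
            b[i + 4] = a[i] - a[i + 4];
        }
        for(i = 0; i < 2; i++) {
            a[i] = b[i] + b[i + 2];
            a[i + 2] = b[i] - b[i + 2];
            a[i + 4] = b[i + 4] + b[i + 6];
            a[i + 6] = b[i + 4] - b[i + 6];
        }
        for(i = 0; i < 4; i++) {
            m2[2 * i][j] = a[2 * i] + a[2 * i + 1];
            m2[2 * i + 1][j] = a[2 * i] - a[2 * i + 1];
        }
    }
    for(i = 0; i < 8; i++) {
        for(j = 0; j < 8; j++) {
            satd += (m2[i][j] >= 0) ? m2[i][j] : -m2[i][j];
        }
    }
    /* drop the DC coefficient, then normalize exactly as the NEON path does */
    satd -= (m2[0][0] >= 0) ? m2[0][0] : -m2[0][0];
    satd = (satd + 2) >> 2;
    return satd;
}
#endif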
#endif /* ARM_NEON */