/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 * may be used to endorse or promote products derived from this software
 * without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "oapv_def.h"
#include <math.h>

#if ARM_NEON

/* SAD for 16bit **************************************************************/
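/*
 * sad_16b_neon_8x2n() returns the sum of absolute differences (SAD) between
 * two blocks of 16-bit samples. The unrolled body below covers eight rows of
 * eight samples; w and h are part of the oapv_fn_sad_t signature but are not
 * used by this kernel, and s_src1/s_src2 are strides counted in samples.
 *
 * Rough scalar equivalent (a reference sketch only, not compiled):
 *
 *     int sad = 0;
 *     for(int i = 0; i < 8; i++) {
 *         for(int j = 0; j < 8; j++) {
 *             sad += abs((int)s1[i * s_src1 + j] - (int)s2[i * s_src2 + j]);
 *         }
 *     }
 */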
int sad_16b_neon_8x2n(int w, int h, void *src1, void *src2, int s_src1, int s_src2)
{
    int sad = 0;
    s16* s1 = (s16*) src1;
    s16* s2 = (s16*) src2;
    int16x8_t s1_vector, s2_vector;
    int32x4_t diff_part1, diff_part2, diff_part1_abs, diff_part2_abs, sad_vector, sad_vector_temp;
    // Loop unrolled
    { // Row 0
        // Loading one row (8 elements) each of src1 and src2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        // Subtracting s2_vector from s1_vector and widening the result to 32 bits
        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        // Taking the absolute values of the differences and adding them
        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector = vaddq_s32(diff_part1_abs, diff_part2_abs);
    }
    { // Row 1
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        // Updating sad_vector by adding the new values
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 3
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 4
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 5
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 6
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    { // Row 7
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_part1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff_part2 = vsubl_high_s16(s1_vector, s2_vector);

        diff_part1_abs = vabsq_s32(diff_part1);
        diff_part2_abs = vabsq_s32(diff_part2);

        sad_vector_temp = vaddq_s32(diff_part1_abs, diff_part2_abs);
        sad_vector = vaddq_s32(sad_vector, sad_vector_temp);
    }
    // Adding all the elements in sad_vector
    sad = vaddvq_s32(sad_vector);
    return sad;
}

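/* Function table consumed by the kernel dispatcher: slot 0 holds the 8-wide
 * SAD kernel above and the remaining slot is left NULL, presumably so callers
 * can fall back to the generic C kernels for shapes this file does not cover
 * (an assumption based on the table layout, not verified here). */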
const oapv_fn_sad_t oapv_tbl_fn_sad_16b_neon[2] = {
    sad_16b_neon_8x2n,
    NULL
};

/* SSD ***********************************************************************/
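/*
 * ssd_16b_neon_8x8() returns the sum of squared differences (SSD) of an 8x8
 * block of 16-bit samples. Differences are widened to 32 bits and their
 * squares accumulated in 64-bit lanes (vmull_s32 / vaddq_s64), so the result
 * cannot overflow for any 16-bit input.
 *
 * Rough scalar equivalent (a reference sketch only, not compiled):
 *
 *     s64 ssd = 0;
 *     for(int i = 0; i < 8; i++) {
 *         for(int j = 0; j < 8; j++) {
 *             s64 d = (s64)s1[i * s_src1 + j] - s2[i * s_src2 + j];
 *             ssd += d * d;
 *         }
 *     }
 */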
static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2)
{
    s64 ssd = 0;
    s16* s1 = (s16*) src1;
    s16* s2 = (s16*) src2;
    int16x8_t s1_vector, s2_vector;
    int32x4_t diff1, diff2;
    int32x2_t diff1_low, diff2_low;
    int64x2_t sq_diff1_low, sq_diff1_high, sq_diff2_low, sq_diff2_high, sq_diff;
    // Loop unrolled
    { // Row 0
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff1_low, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 1
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 3
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 4
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 5
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 6
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    { // Row 7
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    ssd += vaddvq_s64(sq_diff);
    return ssd;
}

const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_neon[2] = {
    ssd_16b_neon_8x8,
    NULL
};

/* DIFF **********************************************************************/
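/*
 * diff_16b_neon_8x8() stores the element-wise difference src1 - src2 of an
 * 8x8 block of 16-bit samples into diff; all strides are counted in samples.
 *
 * Rough scalar equivalent (a reference sketch only, not compiled):
 *
 *     for(int i = 0; i < 8; i++) {
 *         for(int j = 0; j < 8; j++) {
 *             diff[i * s_diff + j] = s1[i * s_src1 + j] - s2[i * s_src2 + j];
 *         }
 *     }
 */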
static void diff_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int s_diff, s16 *diff)
{
    s16* s1 = (s16*) src1;
    s16* s2 = (s16*) src2;
    int16x8_t s1_vector, s2_vector, diff_vector;
    // Loop unrolled
    { // Row 0
        // Loading one row (8 elements) each of src1 and src2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        // Subtracting s2_vector from s1_vector
        diff_vector = vsubq_s16(s1_vector, s2_vector);

        // Storing the result in diff
        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 1
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 2
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 3
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 4
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 5
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 6
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
    { // Row 7
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff_vector = vsubq_s16(s1_vector, s2_vector);

        vst1q_s16(diff, diff_vector);
        diff += s_diff;
    }
}

const oapv_fn_diff_t oapv_tbl_fn_diff_16b_neon[2] = {
    diff_16b_neon_8x8,
    NULL
};

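/* SATD (DC-removed Hadamard) *************************************************/
/*
 * oapv_dc_removed_had8x8_neon() computes a SATD-style cost for one 8x8 block:
 * an 8x8 Hadamard transform is applied with add/sub butterflies (horizontal
 * pass on 16-bit lanes, vertical pass widened to 32 bits), the DC coefficient
 * is zeroed, and the absolute coefficients are summed and rounded.
 *
 * Rough scalar sketch of the same computation (reference only, not compiled;
 * H is assumed to be the order-8 Hadamard matrix with +/-1 entries, matching
 * the sign patterns spelled out in the comments below):
 *
 *     C = H * X * H^T;                        // X = 8x8 block of samples
 *     C[0][0] = 0;                            // drop the DC term
 *     satd = (sum of |C[u][v]| + 2) >> 2;     // rounded scale-down
 *
 * Note: several intrinsics below reuse int16x8_t-declared variables to hold
 * 32-bit or zipped data (e.g. vzip1q_s32 / vaddq_s32 results), which appears
 * to rely on the compiler accepting lax vector-type conversions.
 */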
int oapv_dc_removed_had8x8_neon(pel* org, int s_org)
{
    int satd = 0;
    /* All 128-bit registers are named with a suffix mxnb, where m is the */
    /* number of n-bit elements packed in the register                    */

    int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
    int16x8_t pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
    int16x8_t pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
    int16x8_t out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
    int16x8_t out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;

    src0_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src1_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src2_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src3_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src4_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src5_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src6_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src7_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;

    /**************** 8x8 horizontal transform *******************************/
    /*********************** 8x8 16 bit Transpose ****************************/

    out3_8x16b = vcombine_s16(vget_low_s16(src0_8x16b), vget_low_s16(src1_8x16b));
    out7_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vget_high_s16(src1_8x16b));

    pred0_8x16b = vcombine_s16(vget_low_s16(src2_8x16b), vget_low_s16(src3_8x16b));
    src2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vget_high_s16(src3_8x16b));

    out2_8x16b = vcombine_s16(vget_low_s16(src4_8x16b), vget_low_s16(src5_8x16b));
    pred7_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vget_high_s16(src5_8x16b));

    pred3_8x16b = vcombine_s16(vget_low_s16(src6_8x16b), vget_low_s16(src7_8x16b));
    src6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vget_high_s16(src7_8x16b));

    out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
    out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);

    pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);

    out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
    out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);

    pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);

    out0_8x16b = vzip1q_s64(out1_8x16b, pred1_8x16b);
    out1_8x16b = vzip2q_s64(out1_8x16b, pred1_8x16b);
    out2_8x16b = vzip1q_s64(out3_8x16b, pred3_8x16b);
    out3_8x16b = vzip2q_s64(out3_8x16b, pred3_8x16b);
    out4_8x16b = vzip1q_s64(out5_8x16b, pred5_8x16b);
    out5_8x16b = vzip2q_s64(out5_8x16b, pred5_8x16b);
    out6_8x16b = vzip1q_s64(out7_8x16b, pred7_8x16b);
    out7_8x16b = vzip2q_s64(out7_8x16b, pred7_8x16b);

    /********************** 8x8 16 bit Transpose End *************************/
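    /* Butterfly stage: each src*_8x16b computed below holds one row of the
     * unnormalized (+/-1) order-8 Hadamard transform applied across the eight
     * transposed rows; the comment on each line gives the sign pattern. */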

    /* r0 + r1 */
    pred0_8x16b = vaddq_s16(out0_8x16b, out1_8x16b);
    /* r2 + r3 */
    pred2_8x16b = vaddq_s16(out2_8x16b, out3_8x16b);
    /* r4 + r5 */
    pred4_8x16b = vaddq_s16(out4_8x16b, out5_8x16b);
    /* r6 + r7 */
    pred6_8x16b = vaddq_s16(out6_8x16b, out7_8x16b);

    /* r0 + r1 + r2 + r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 + r6 + r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
    src0_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
    src4_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 + r1 - r2 - r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 - r6 - r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
    src2_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
    src6_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 */
    pred0_8x16b = vsubq_s16(out0_8x16b, out1_8x16b);
    /* r2 - r3 */
    pred2_8x16b = vsubq_s16(out2_8x16b, out3_8x16b);
    /* r4 - r5 */
    pred4_8x16b = vsubq_s16(out4_8x16b, out5_8x16b);
    /* r6 - r7 */
    pred6_8x16b = vsubq_s16(out6_8x16b, out7_8x16b);

    /* r0 - r1 + r2 - r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 + r6 - r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
    src1_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
    src5_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 - r2 + r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 - r6 + r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
    src3_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
    src7_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /*********************** 8x8 16 bit Transpose ****************************/
    out3_8x16b = vzip1q_s16(src0_8x16b, src1_8x16b);
    pred0_8x16b = vzip1q_s16(src2_8x16b, src3_8x16b);
    out2_8x16b = vzip1q_s16(src4_8x16b, src5_8x16b);
    pred3_8x16b = vzip1q_s16(src6_8x16b, src7_8x16b);
    out7_8x16b = vzip2q_s16(src0_8x16b, src1_8x16b);
    src2_8x16b = vzip2q_s16(src2_8x16b, src3_8x16b);
    pred7_8x16b = vzip2q_s16(src4_8x16b, src5_8x16b);
    src6_8x16b = vzip2q_s16(src6_8x16b, src7_8x16b);

    out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
    out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);

    pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);

    out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
    out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);

    pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);

    src0_8x16b = vzip1q_s64(out1_8x16b, pred1_8x16b);
    src1_8x16b = vzip2q_s64(out1_8x16b, pred1_8x16b);
    src2_8x16b = vzip1q_s64(out3_8x16b, pred3_8x16b);
    src3_8x16b = vzip2q_s64(out3_8x16b, pred3_8x16b);
    src4_8x16b = vzip1q_s64(out5_8x16b, pred5_8x16b);
    src5_8x16b = vzip2q_s64(out5_8x16b, pred5_8x16b);
    src6_8x16b = vzip1q_s64(out7_8x16b, pred7_8x16b);
    src7_8x16b = vzip2q_s64(out7_8x16b, pred7_8x16b);

    /********************** 8x8 16 bit Transpose End *************************/
    /**************** 8x8 horizontal transform *******************************/
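    /* The vertical pass repeats the same butterfly network per column, but
     * first widens each 16-bit lane to 32 bits (vmovl_s16) and processes the
     * block as two 4-column halves so the eight-term sums cannot overflow. */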
    {
        int16x8_t out0a_8x16b, out1a_8x16b, out2a_8x16b, out3a_8x16b;
        int16x8_t out4a_8x16b, out5a_8x16b, out6a_8x16b, out7a_8x16b;
        int16x8_t tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
        int16x8_t tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;

        /************************* 8x8 Vertical Transform ********************/
        tmp0_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vcreate_s32(0));
        tmp1_8x16b = vcombine_s16(vget_high_s16(src1_8x16b), vcreate_s32(0));
        tmp2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vcreate_s32(0));
        tmp3_8x16b = vcombine_s16(vget_high_s16(src3_8x16b), vcreate_s32(0));
        tmp4_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vcreate_s32(0));
        tmp5_8x16b = vcombine_s16(vget_high_s16(src5_8x16b), vcreate_s32(0));
        tmp6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vcreate_s32(0));
        tmp7_8x16b = vcombine_s16(vget_high_s16(src7_8x16b), vcreate_s32(0));

        /************************* First 4 pixels ****************************/

        src0_8x16b = vmovl_s16(vget_low_s16(src0_8x16b));
        src1_8x16b = vmovl_s16(vget_low_s16(src1_8x16b));
        src2_8x16b = vmovl_s16(vget_low_s16(src2_8x16b));
        src3_8x16b = vmovl_s16(vget_low_s16(src3_8x16b));
        src4_8x16b = vmovl_s16(vget_low_s16(src4_8x16b));
        src5_8x16b = vmovl_s16(vget_low_s16(src5_8x16b));
        src6_8x16b = vmovl_s16(vget_low_s16(src6_8x16b));
        src7_8x16b = vmovl_s16(vget_low_s16(src7_8x16b));

        /* r0 + r1 */
        pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /************************* First 4 pixels ****************************/

        /************************** Next 4 pixels ****************************/
        src0_8x16b = vmovl_s16(vget_low_s16(tmp0_8x16b));
        src1_8x16b = vmovl_s16(vget_low_s16(tmp1_8x16b));
        src2_8x16b = vmovl_s16(vget_low_s16(tmp2_8x16b));
        src3_8x16b = vmovl_s16(vget_low_s16(tmp3_8x16b));
        src4_8x16b = vmovl_s16(vget_low_s16(tmp4_8x16b));
        src5_8x16b = vmovl_s16(vget_low_s16(tmp5_8x16b));
        src6_8x16b = vmovl_s16(vget_low_s16(tmp6_8x16b));
        src7_8x16b = vmovl_s16(vget_low_s16(tmp7_8x16b));

        /* r0 + r1 */
        pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /************************** Next 4 pixels ****************************/
        /************************* 8x8 Vertical Transform ********************/

        /**************************** SATD calculation ***********************/
        src0_8x16b = vabsq_s32(out0_8x16b);
        src1_8x16b = vabsq_s32(out1_8x16b);
        src2_8x16b = vabsq_s32(out2_8x16b);
        src3_8x16b = vabsq_s32(out3_8x16b);
        src4_8x16b = vabsq_s32(out4_8x16b);
        src5_8x16b = vabsq_s32(out5_8x16b);
        src6_8x16b = vabsq_s32(out6_8x16b);
        src7_8x16b = vabsq_s32(out7_8x16b);
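        /* Remove the DC term: lane 0 of src0_8x16b holds |C(0,0)| (the DC
         * coefficient of the first four columns), so zeroing it through a
         * 32-bit pointer below turns the plain SATD into a DC-removed SATD. */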
        s32* p = (s32*)&src0_8x16b;
        p[0] = 0;

        satd = vaddvq_s32(src0_8x16b);
        satd += vaddvq_s32(src1_8x16b);
        satd += vaddvq_s32(src2_8x16b);
        satd += vaddvq_s32(src3_8x16b);
        satd += vaddvq_s32(src4_8x16b);
        satd += vaddvq_s32(src5_8x16b);
        satd += vaddvq_s32(src6_8x16b);
        satd += vaddvq_s32(src7_8x16b);

        src0_8x16b = vabsq_s32(out0a_8x16b);
        src1_8x16b = vabsq_s32(out1a_8x16b);
        src2_8x16b = vabsq_s32(out2a_8x16b);
        src3_8x16b = vabsq_s32(out3a_8x16b);
        src4_8x16b = vabsq_s32(out4a_8x16b);
        src5_8x16b = vabsq_s32(out5a_8x16b);
        src6_8x16b = vabsq_s32(out6a_8x16b);
        src7_8x16b = vabsq_s32(out7a_8x16b);

        satd += vaddvq_s32(src0_8x16b);
        satd += vaddvq_s32(src1_8x16b);
        satd += vaddvq_s32(src2_8x16b);
        satd += vaddvq_s32(src3_8x16b);
        satd += vaddvq_s32(src4_8x16b);
        satd += vaddvq_s32(src5_8x16b);
        satd += vaddvq_s32(src6_8x16b);
        satd += vaddvq_s32(src7_8x16b);

        /* Rounding divide by 4 */
        satd = (satd + 2) >> 2;
        return satd;
    }
}

#endif /* ARM_NEON */