/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  ihevce_had_compute_neon.c
 *
 * @brief
 *  Contains NEON intrinsic definitions of functions that compute the
 *  Hadamard transform and Hadamard SAD (SATD)
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *
 * @remarks
 *  None
 *
 ********************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevce_had_satd.h"
#include "ihevce_cmn_utils_instr_set_router.h"

/*****************************************************************************/
/* Globals                                                                   */
/*****************************************************************************/
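/* Mask that zeroes the DC coefficient (lane 0) of a Hadamard block while
 * keeping the seven AC lanes intact: 0xffff converts to -1 (all bits set)
 * in int16_t, so vandq_s16 with this mask clears only the DC term. Used by
 * the ac_only paths below. */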
const int16_t gu2_dc_mask[8] = { 0x0000, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff };

/*****************************************************************************/
/* Function Macros                                                           */
/*****************************************************************************/
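/* RESIDUE(k, is_chroma) loads one row of 8 source and 8 prediction pixels,
 * widens to 16 bit and stores (src - pred) in *rk, advancing both pointers
 * by one stride. For chroma the rows are interleaved (semi-planar U/V), so
 * vld2_u8 de-interleaves them and val[0] selects a single plane. */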
#define RESIDUE(k, is_chroma)                                                \
    if(!is_chroma)                                                           \
    {                                                                        \
        const uint8x8_t s##k = vld1_u8(pu1_src);                             \
        const uint8x8_t p##k = vld1_u8(pu1_pred);                            \
        *r##k = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k));                 \
        pu1_src += src_strd;                                                 \
        pu1_pred += pred_strd;                                               \
    }                                                                        \
    else                                                                     \
    {                                                                        \
        const uint8x8_t s##k = vld2_u8(pu1_src).val[0];                      \
        const uint8x8_t p##k = vld2_u8(pu1_pred).val[0];                     \
        *r##k = vreinterpretq_s16_u16(vsubl_u8(s##k, p##k));                 \
        pu1_src += src_strd;                                                 \
        pu1_pred += pred_strd;                                               \
    }

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

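/* One butterfly pass of the 4-point Hadamard transform over four rows.
 * Each int16x8_t holds the corresponding row of two adjacent 4x4 blocks
 * (one block per half), so both blocks are transformed at once:
 *   r0' = r0 + r1 + r2 + r3
 *   r1' = r0 - r1 + r2 - r3
 *   r2' = r0 + r1 - r2 - r3
 *   r3' = r0 - r1 - r2 + r3 */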
static INLINE void
hadamard4x4_2_one_pass(int16x8_t *r0, int16x8_t *r1, int16x8_t *r2, int16x8_t *r3)
{
    const int16x8_t a0 = vaddq_s16(*r0, *r2);
    const int16x8_t a1 = vaddq_s16(*r1, *r3);
    const int16x8_t a2 = vsubq_s16(*r0, *r2);
    const int16x8_t a3 = vsubq_s16(*r1, *r3);

    *r0 = vaddq_s16(a0, a1);
    *r1 = vsubq_s16(a0, a1);
    *r2 = vaddq_s16(a2, a3);
    *r3 = vsubq_s16(a2, a3);
}

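/* 2-D Hadamard transform of two horizontally adjacent 4x4 residue blocks:
 * residue computation, vertical pass, transpose of both packed 4x4 halves,
 * horizontal pass. Results stay packed two-blocks-per-register in r0..r3. */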
static INLINE void hadamard4x4_2(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3)
{
    // compute error between src and pred
    RESIDUE(0, 0);
    RESIDUE(1, 0);
    RESIDUE(2, 0);
    RESIDUE(3, 0);

    // vertical hadamard tx
    hadamard4x4_2_one_pass(r0, r1, r2, r3);

    // transpose
    transpose_s16_4x4q(r0, r1, r2, r3);

    // horizontal hadamard tx
    hadamard4x4_2_one_pass(r0, r1, r2, r3);
}

static INLINE void hadamard4x4_4(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3,
    int16x8_t *r4,
    int16x8_t *r5,
    int16x8_t *r6,
    int16x8_t *r7)
{
    // hadamard 4x4_2 (top two 4x4 blocks)
    hadamard4x4_2(pu1_src, src_strd, pu1_pred, pred_strd, r0, r1, r2, r3);

    // hadamard 4x4_2 (bottom two 4x4 blocks)
    pu1_src += (4 * src_strd);
    pu1_pred += (4 * pred_strd);
    hadamard4x4_2(pu1_src, src_strd, pu1_pred, pred_strd, r4, r5, r6, r7);
}

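/* SATD of the four packed 4x4 transforms held in a[0..7]. Each per-block
 * absolute-coefficient sum is rounded and normalized by vrshrn_n_s64(x, 2),
 * i.e. (x + 2) >> 2; the four 4x4 HSADs are stored two per row through
 * pi4_hsad, and the sum of the four normalized HSADs is returned. */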
static INLINE WORD32 hadamard_sad4x4_4(int16x8_t *a, WORD32 *pi4_hsad, WORD32 hsad_stride)
{
    int16x8_t p[8];
    int32x4_t b01, b23;
    int64x2_t c01, c23;
    int32x2_t d01, d23;

    // satd
    p[0] = vabsq_s16(a[0]);
    p[1] = vabsq_s16(a[1]);
    p[0] = vaddq_s16(p[0], p[1]);
    p[2] = vabsq_s16(a[2]);
    p[3] = vabsq_s16(a[3]);
    p[2] = vaddq_s16(p[2], p[3]);

    p[4] = vabsq_s16(a[4]);
    p[5] = vabsq_s16(a[5]);
    p[4] = vaddq_s16(p[4], p[5]);
    p[6] = vabsq_s16(a[6]);
    p[7] = vabsq_s16(a[7]);
    p[6] = vaddq_s16(p[6], p[7]);

    p[0] = vaddq_s16(p[0], p[2]);
    b01 = vpaddlq_s16(p[0]);
    c01 = vpaddlq_s32(b01);
    d01 = vrshrn_n_s64(c01, 2);
    vst1_s32(pi4_hsad, d01);
    pi4_hsad += hsad_stride;

    p[4] = vaddq_s16(p[4], p[6]);
    b23 = vpaddlq_s16(p[4]);
    c23 = vpaddlq_s32(b23);
    d23 = vrshrn_n_s64(c23, 2);
    vst1_s32(pi4_hsad, d23);

    d01 = vadd_s32(d01, d23);

    return (WORD32)(vget_lane_s64(vpaddl_s32(d01), 0));
}

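/* Extends the four packed 4x4 transforms in a[0..7] to a full 8x8 Hadamard
 * by one more butterfly stage in each direction, then returns the 8x8 SATD
 * normalized as (satd + 4) >> 3. EARLY_EXIT raises *early_cbf as soon as
 * any |coefficient| exceeds the threshold derived from i4_frm_qstep. */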
static INLINE WORD32 hadamard_sad8x8_using4x4(int16x8_t *a, WORD32 *early_cbf, WORD32 i4_frm_qstep)
{
    int16x8_t p[8];
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    int32x4_t b;
    int64x2_t c;
    int64_t satd;
    WORD32 i;

    for(i = 0; i < 4; i++)
    {
        int16x8_t p0 = vaddq_s16(a[i], a[i + 4]);
        int16x8_t p1 = vsubq_s16(a[i], a[i + 4]);

        int16x4_t q0 = vadd_s16(vget_low_s16(p0), vget_high_s16(p0));
        int16x4_t q1 = vsub_s16(vget_low_s16(p0), vget_high_s16(p0));
        int16x4_t q2 = vadd_s16(vget_low_s16(p1), vget_high_s16(p1));
        int16x4_t q3 = vsub_s16(vget_low_s16(p1), vget_high_s16(p1));

        a[i] = vcombine_s16(q0, q2);
        a[i + 4] = vcombine_s16(q1, q3);
    }

#define EARLY_EXIT(k)                                                        \
    {                                                                        \
        p[k] = vabsq_s16(a[k]);                                              \
        if(*early_cbf == 0)                                                  \
        {                                                                    \
            uint16x8_t cmp;                                                  \
            cmp = vcgtq_s16(p[k], threshold);                                \
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp)), 0) ||  \
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp)), 0))   \
            {                                                                \
                *early_cbf = 1;                                              \
            }                                                                \
        }                                                                    \
    }
    // satd
    EARLY_EXIT(0);
    EARLY_EXIT(1);
    p[0] = vaddq_s16(p[0], p[1]);
    EARLY_EXIT(2);
    EARLY_EXIT(3);
    p[2] = vaddq_s16(p[2], p[3]);

    EARLY_EXIT(4);
    EARLY_EXIT(5);
    p[4] = vaddq_s16(p[4], p[5]);
    EARLY_EXIT(6);
    EARLY_EXIT(7);
#undef EARLY_EXIT
    p[6] = vaddq_s16(p[6], p[7]);

    p[0] = vaddq_s16(p[0], p[2]);
    p[4] = vaddq_s16(p[4], p[6]);
    p[0] = vaddq_s16(p[0], p[4]);
    b = vpaddlq_s16(p[0]);
    c = vpaddlq_s32(b);
    satd = vget_lane_s64(vadd_s64(vget_low_s64(c), vget_high_s64(c)), 0);

    return ((satd + 4) >> 3);
}

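/* One dimension of the 8-point Hadamard transform on eight rows: three
 * in-place radix-2 butterfly stages with strides 4, 2 and 1. */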
static INLINE void hadamard8x8_one_pass(
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3,
    int16x8_t *r4,
    int16x8_t *r5,
    int16x8_t *r6,
    int16x8_t *r7)
{
    const int16x8_t a0 = vaddq_s16(*r0, *r4);
    const int16x8_t a4 = vsubq_s16(*r0, *r4);
    const int16x8_t a1 = vaddq_s16(*r1, *r5);
    const int16x8_t a5 = vsubq_s16(*r1, *r5);
    const int16x8_t a2 = vaddq_s16(*r2, *r6);
    const int16x8_t a6 = vsubq_s16(*r2, *r6);
    const int16x8_t a3 = vaddq_s16(*r3, *r7);
    const int16x8_t a7 = vsubq_s16(*r3, *r7);

    const int16x8_t b0 = vaddq_s16(a0, a2);
    const int16x8_t b2 = vsubq_s16(a0, a2);
    const int16x8_t b1 = vaddq_s16(a1, a3);
    const int16x8_t b3 = vsubq_s16(a1, a3);
    const int16x8_t b4 = vaddq_s16(a4, a6);
    const int16x8_t b6 = vsubq_s16(a4, a6);
    const int16x8_t b5 = vaddq_s16(a5, a7);
    const int16x8_t b7 = vsubq_s16(a5, a7);

    *r0 = vaddq_s16(b0, b1);
    *r1 = vsubq_s16(b0, b1);
    *r2 = vaddq_s16(b2, b3);
    *r3 = vsubq_s16(b2, b3);
    *r4 = vaddq_s16(b4, b5);
    *r5 = vsubq_s16(b4, b5);
    *r6 = vaddq_s16(b6, b7);
    *r7 = vsubq_s16(b6, b7);
}

static INLINE void hadamard8x8(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    int16x8_t *r0,
    int16x8_t *r1,
    int16x8_t *r2,
    int16x8_t *r3,
    int16x8_t *r4,
    int16x8_t *r5,
    int16x8_t *r6,
    int16x8_t *r7,
    WORD32 is_chroma)
{
    // compute error between src and pred
    RESIDUE(0, is_chroma);
    RESIDUE(1, is_chroma);
    RESIDUE(2, is_chroma);
    RESIDUE(3, is_chroma);
    RESIDUE(4, is_chroma);
    RESIDUE(5, is_chroma);
    RESIDUE(6, is_chroma);
    RESIDUE(7, is_chroma);

    // vertical hadamard tx
    hadamard8x8_one_pass(r0, r1, r2, r3, r4, r5, r6, r7);

    // transpose
    transpose_s16_8x8(r0, r1, r2, r3, r4, r5, r6, r7);

    // horizontal hadamard tx
    hadamard8x8_one_pass(r0, r1, r2, r3, r4, r5, r6, r7);
}

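/* 8x8 SATD of one plane: 2-D Hadamard of the residue, optional masking of
 * the DC coefficient (ac_only), then the absolute-coefficient sum rounded
 * as (satd + 4) >> 3. */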
static INLINE UWORD32 ihevce_HAD_8x8_8bit_plane_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD32 is_chroma,
    WORD32 ac_only)
{
    int16x8_t a0, a1, a2, a3, a4, a5, a6, a7;
    int32x4_t b;
    int64x2_t c;
    int64_t satd;

    // hadamard 8x8
    hadamard8x8(
        pu1_src, src_strd, pu1_pred, pred_strd, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7, is_chroma);

    if(ac_only)
    {
        const int16x8_t mask = vld1q_s16(gu2_dc_mask);
        a0 = vandq_s16(a0, mask);
    }

    // satd
    a0 = vabsq_s16(a0);
    a1 = vabsq_s16(a1);
    a0 = vaddq_s16(a0, a1);
    a2 = vabsq_s16(a2);
    a3 = vabsq_s16(a3);
    a2 = vaddq_s16(a2, a3);

    a4 = vabsq_s16(a4);
    a5 = vabsq_s16(a5);
    a4 = vaddq_s16(a4, a5);
    a6 = vabsq_s16(a6);
    a7 = vabsq_s16(a7);
    a6 = vaddq_s16(a6, a7);

    a0 = vaddq_s16(a0, a2);
    a4 = vaddq_s16(a4, a6);
    a0 = vaddq_s16(a0, a4);
    b = vpaddlq_s16(a0);
    c = vpaddlq_s32(b);
    satd = vget_lane_s64(vadd_s64(vget_low_s64(c), vget_high_s64(c)), 0);

    return ((satd + 4) >> 3);
}

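/* 4x4 SATD of one plane. The whole residue block fits in two int16x8
 * registers, so both transform passes and the transposes (vtrn_s16 and
 * vtrnq_s32) operate on packed rows; the result is rounded as
 * (sad + 2) >> 2, with optional DC masking for the AC-only variant. */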
static INLINE UWORD32 ihevce_HAD_4x4_8bit_plane_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD32 is_chroma,
    WORD32 ac_only)
{
    uint8x16_t src_u8, pred_u8;
    int16x8_t res_01, res_23;
    int16x4_t h[4];
    int16x4_t v[4];
    int16x4x2_t trans_4[2];
    int16x8_t combined_rows[4];
    int32x4x2_t trans_8;
    int32x4_t sad_32_4[3];
    int32x2_t sad_32_2;
    int64x1_t sad_64_1;
    int32_t sad;

    if(!is_chroma)
    {
        src_u8 = load_unaligned_u8q(pu1_src, src_strd);
        pred_u8 = load_unaligned_u8q(pu1_pred, pred_strd);
    }
    else
    {
        src_u8 = load_unaligned_u8qi(pu1_src, src_strd);
        pred_u8 = load_unaligned_u8qi(pu1_pred, pred_strd);
    }
    res_01 = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(src_u8), vget_low_u8(pred_u8)));
    res_23 = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(src_u8), vget_high_u8(pred_u8)));

    h[0] = vadd_s16(vget_low_s16(res_01), vget_high_s16(res_23));
    h[1] = vadd_s16(vget_high_s16(res_01), vget_low_s16(res_23));
    h[2] = vsub_s16(vget_high_s16(res_01), vget_low_s16(res_23));
    h[3] = vsub_s16(vget_low_s16(res_01), vget_high_s16(res_23));

    v[0] = vadd_s16(h[0], h[1]);
    v[1] = vadd_s16(h[3], h[2]);
    v[2] = vsub_s16(h[0], h[1]);
    v[3] = vsub_s16(h[3], h[2]);

    trans_4[0] = vtrn_s16(v[0], v[2]);
    trans_4[1] = vtrn_s16(v[1], v[3]);

    combined_rows[0] = vcombine_s16(trans_4[0].val[0], trans_4[1].val[0]);
    combined_rows[1] = vcombine_s16(trans_4[0].val[1], trans_4[1].val[1]);

    combined_rows[2] = vaddq_s16(combined_rows[0], combined_rows[1]);
    combined_rows[3] = vsubq_s16(combined_rows[0], combined_rows[1]);

    trans_8 =
        vtrnq_s32(vreinterpretq_s32_s16(combined_rows[2]), vreinterpretq_s32_s16(combined_rows[3]));

    combined_rows[0] =
        vaddq_s16(vreinterpretq_s16_s32(trans_8.val[0]), vreinterpretq_s16_s32(trans_8.val[1]));
    combined_rows[0] = vabsq_s16(combined_rows[0]);
    combined_rows[1] =
        vsubq_s16(vreinterpretq_s16_s32(trans_8.val[0]), vreinterpretq_s16_s32(trans_8.val[1]));
    combined_rows[1] = vabsq_s16(combined_rows[1]);

    if(ac_only)
    {
        const int16x8_t mask = vld1q_s16(gu2_dc_mask);
        combined_rows[0] = vandq_s16(combined_rows[0], mask);
    }

    sad_32_4[0] = vpaddlq_s16(combined_rows[0]);
    sad_32_4[1] = vpaddlq_s16(combined_rows[1]);
    sad_32_4[2] = vaddq_s32(sad_32_4[0], sad_32_4[1]);
    sad_32_2 = vadd_s32(vget_high_s32(sad_32_4[2]), vget_low_s32(sad_32_4[2]));
    sad_64_1 = vpaddl_s32(sad_32_2);
    sad = vget_lane_s64(sad_64_1, 0);

    return ((sad + 2) >> 2);
}

UWORD32 ihevce_HAD_4x4_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_4x4_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 0);
}

UWORD32 ihevce_chroma_compute_AC_HAD_4x4_8bit_neon(
    UWORD8 *pu1_origin,
    WORD32 src_strd,
    UWORD8 *pu1_pred_buf,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_4x4_8bit_plane_neon(pu1_origin, src_strd, pu1_pred_buf, pred_strd, 1, 1);
}

UWORD32 ihevce_HAD_8x8_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 0);
}

UWORD32 ihevce_compute_ac_had_8x8_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 0, 1);
}

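/* 16x16 SATD assembled from four 8x8 Hadamard transforms. The combining
 * butterflies use halving add/sub (vhaddq_s16/vhsubq_s16) so the 16-bit
 * lanes stay in range, at the cost of truncating the low bit of each
 * intermediate; the accumulated sum is rounded as (satd + 4) >> 3. */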
UWORD32 ihevce_HAD_16x16_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    int16x8_t b0[8];
    int16x8_t b1[8];
    int16x8_t b2[8];
    int16x8_t b3[8];
    uint32x4_t sum = vdupq_n_u32(0);
    uint64x2_t c;
    uint64_t satd;
    WORD32 i;

    (void)pi2_dst;
    (void)dst_strd;

    // hadamard 8x8 - b0
    hadamard8x8(
        pu1_src,
        src_strd,
        pu1_pred,
        pred_strd,
        &b0[0],
        &b0[1],
        &b0[2],
        &b0[3],
        &b0[4],
        &b0[5],
        &b0[6],
        &b0[7],
        0);
    // hadamard 8x8 - b1
    hadamard8x8(
        pu1_src + 8,
        src_strd,
        pu1_pred + 8,
        pred_strd,
        &b1[0],
        &b1[1],
        &b1[2],
        &b1[3],
        &b1[4],
        &b1[5],
        &b1[6],
        &b1[7],
        0);
    // hadamard 8x8 - b2
    hadamard8x8(
        pu1_src + (8 * src_strd),
        src_strd,
        pu1_pred + (8 * pred_strd),
        pred_strd,
        &b2[0],
        &b2[1],
        &b2[2],
        &b2[3],
        &b2[4],
        &b2[5],
        &b2[6],
        &b2[7],
        0);
    // hadamard 8x8 - b3
    hadamard8x8(
        pu1_src + (8 * src_strd) + 8,
        src_strd,
        pu1_pred + (8 * pred_strd) + 8,
        pred_strd,
        &b3[0],
        &b3[1],
        &b3[2],
        &b3[3],
        &b3[4],
        &b3[5],
        &b3[6],
        &b3[7],
        0);

    for(i = 0; i < 8; i++)
    {
        int16x8_t p0 = vhaddq_s16(b0[i], b1[i]);
        int16x8_t p1 = vhsubq_s16(b0[i], b1[i]);
        int16x8_t p2 = vhaddq_s16(b2[i], b3[i]);
        int16x8_t p3 = vhsubq_s16(b2[i], b3[i]);

        int16x8_t q0 = vaddq_s16(p0, p2);
        int16x8_t q1 = vsubq_s16(p0, p2);
        int16x8_t q2 = vaddq_s16(p1, p3);
        int16x8_t q3 = vsubq_s16(p1, p3);

        uint16x8_t r0 =
            vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(q0)), vreinterpretq_u16_s16(vabsq_s16(q1)));
        uint16x8_t r1 =
            vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(q2)), vreinterpretq_u16_s16(vabsq_s16(q3)));

        uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
        uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

        sum = vaddq_u32(sum, s0);
        sum = vaddq_u32(sum, s1);
    }

    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    return ((satd + 4) >> 3);
}

UWORD32 ihevce_chroma_HAD_4x4_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_4x4_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
}

UWORD32 ihevce_chroma_HAD_8x8_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    (void)pi2_dst;
    (void)dst_strd;
    return ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
}

UWORD32 ihevce_chroma_HAD_16x16_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    UWORD32 au4_satd[4];

    (void)pi2_dst;
    (void)dst_strd;
    au4_satd[0] = ihevce_HAD_8x8_8bit_plane_neon(pu1_src, src_strd, pu1_pred, pred_strd, 1, 0);
    au4_satd[1] =
        ihevce_HAD_8x8_8bit_plane_neon(pu1_src + 16, src_strd, pu1_pred + 16, pred_strd, 1, 0);
    au4_satd[2] = ihevce_HAD_8x8_8bit_plane_neon(
        pu1_src + 8 * src_strd, src_strd, pu1_pred + 8 * pred_strd, pred_strd, 1, 0);
    au4_satd[3] = ihevce_HAD_8x8_8bit_plane_neon(
        pu1_src + 8 * src_strd + 16, src_strd, pu1_pred + 8 * pred_strd + 16, pred_strd, 1, 0);

    return au4_satd[0] + au4_satd[1] + au4_satd[2] + au4_satd[3];
}

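/* 32x32 SATD: sixteen 8x8 Hadamard transforms, combined into four 16x16
 * blocks with halving butterflies plus a >> 2 per lane to stay within
 * 16 bit, then one more butterfly stage across the 16x16 blocks. The
 * accumulated absolute sum is rounded as (satd + 2) >> 2. */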
UWORD32 ihevce_HAD_32x32_8bit_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    int16x8_t a[4][4][8];
    uint32x4_t sum = vdupq_n_u32(0);
    WORD32 b8, b16;
    uint64x2_t c;
    uint64_t satd;
    WORD32 i, j;

    (void)pi2_dst;
    (void)dst_strd;
    // hadamard 32x32
    for(b16 = 0; b16 < 4; b16++)
    {
        UWORD8 *pu1_src_b16 = pu1_src + (b16 >> 1) * (src_strd * 16) + ((b16 & 1) * 16);
        UWORD8 *pu1_pred_b16 = pu1_pred + (b16 >> 1) * (pred_strd * 16) + ((b16 & 1) * 16);
        // hadamard 16x16
        for(b8 = 0; b8 < 4; b8++)
        {
            UWORD8 *pu1_src_b8 = pu1_src_b16 + (b8 >> 1) * (src_strd * 8) + ((b8 & 1) * 8);
            UWORD8 *pu1_pred_b8 = pu1_pred_b16 + (b8 >> 1) * (pred_strd * 8) + ((b8 & 1) * 8);
            // hadamard 8x8
            hadamard8x8(
                pu1_src_b8,
                src_strd,
                pu1_pred_b8,
                pred_strd,
                &a[b16][b8][0],
                &a[b16][b8][1],
                &a[b16][b8][2],
                &a[b16][b8][3],
                &a[b16][b8][4],
                &a[b16][b8][5],
                &a[b16][b8][6],
                &a[b16][b8][7],
                0);
        }
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vhaddq_s16(a[b16][0][i], a[b16][1][i]);
            int16x8_t p1 = vhsubq_s16(a[b16][0][i], a[b16][1][i]);
            int16x8_t p2 = vhaddq_s16(a[b16][2][i], a[b16][3][i]);
            int16x8_t p3 = vhsubq_s16(a[b16][2][i], a[b16][3][i]);

            a[b16][0][i] = vaddq_s16(p0, p2);
            a[b16][1][i] = vsubq_s16(p0, p2);
            a[b16][2][i] = vaddq_s16(p1, p3);
            a[b16][3][i] = vsubq_s16(p1, p3);

            a[b16][0][i] = vshrq_n_s16(a[b16][0][i], 2);
            a[b16][1][i] = vshrq_n_s16(a[b16][1][i], 2);
            a[b16][2][i] = vshrq_n_s16(a[b16][2][i], 2);
            a[b16][3][i] = vshrq_n_s16(a[b16][3][i], 2);
        }
    }
    for(j = 0; j < 4; j++)
    {
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vaddq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p1 = vsubq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p2 = vaddq_s16(a[2][j][i], a[3][j][i]);
            int16x8_t p3 = vsubq_s16(a[2][j][i], a[3][j][i]);

            int16x8_t q0 = vaddq_s16(p0, p2);
            int16x8_t q1 = vsubq_s16(p0, p2);
            int16x8_t q2 = vaddq_s16(p1, p3);
            int16x8_t q3 = vsubq_s16(p1, p3);

            uint16x8_t r0 = vaddq_u16(
                vreinterpretq_u16_s16(vabsq_s16(q0)), vreinterpretq_u16_s16(vabsq_s16(q1)));
            uint16x8_t r1 = vaddq_u16(
                vreinterpretq_u16_s16(vabsq_s16(q2)), vreinterpretq_u16_s16(vabsq_s16(q3)));

            uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
            uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

            sum = vaddq_u32(sum, s0);
            sum = vaddq_u32(sum, s1);
        }
    }
    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    return ((satd + 2) >> 2);
}

WORD32 ihevce_had4_4x4_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst4x4,
    WORD32 dst_strd,
    WORD32 *pi4_hsad,
    WORD32 hsad_stride,
    WORD32 i4_frm_qstep)
{
    int16x8_t a[8];

    (void)pi2_dst4x4;
    (void)dst_strd;
    (void)i4_frm_qstep;

    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call -------- */
    hadamard4x4_4(
        pu1_src,
        src_strd,
        pu1_pred,
        pred_strd,
        &a[0],
        &a[1],
        &a[2],
        &a[3],
        &a[4],
        &a[5],
        &a[6],
        &a[7]);

    return hadamard_sad4x4_4(a, pi4_hsad, hsad_stride);
}

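/* Recursive 8x8 HAD with TU-split decision: computes the four 4x4 HSADs
 * (cost_child, plus an estimated CBF signalling cost) and the 8x8 HSAD
 * (cost_parent), stores the 8x8 coefficients to pi2_dst for reuse at the
 * 16x16 level, and keeps the cheaper alternative when splitting is allowed.
 * The returned word packs the early-CBF flag in bit 0, the tu_split flag
 * in bit 1 and the best cost in the bits above. */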
WORD32 ihevce_had_8x8_using_4_4x4_r_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 **ppi4_hsad,
    WORD32 **ppi4_tu_split,
    WORD32 **ppi4_tu_early_cbf,
    WORD32 pos_x_y_4x4,
    WORD32 num_4x4_in_row,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    WORD32 i4_cur_depth,
    WORD32 i4_max_depth,
    WORD32 i4_max_tr_size,
    WORD32 *pi4_tu_split_cost,
    void *pv_func_sel)
{
    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;

    WORD32 *pi4_4x4_hsad;
    WORD32 *pi4_8x8_hsad;
    WORD32 *pi4_8x8_tu_split;
    WORD32 *pi4_8x8_tu_early_cbf;

    WORD32 cost_child, cost_parent;
    WORD32 best_cost;
    WORD32 early_cbf = 0;
    const UWORD8 u1_cur_tr_size = 8;

    WORD32 i;

    int16x8_t a[8];

    (void)pv_func_sel;

    assert(pos_x >= 0);
    assert(pos_y >= 0);

    /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */
    pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
    pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
    pi4_8x8_tu_early_cbf =
        ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);

    /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call -------- */
    hadamard4x4_4(
        pu1_src,
        src_strd,
        pu1_pred,
        pred_strd,
        &a[0],
        &a[1],
        &a[2],
        &a[3],
        &a[4],
        &a[5],
        &a[6],
        &a[7]);

    /* -------- cost child -------- */
    cost_child = hadamard_sad4x4_4(a, pi4_4x4_hsad, num_4x4_in_row);
    /* 4 CBF flags; the extra +1 in the shift is because 0.5 bits per bin is assumed */
    cost_child += ((4) * lambda) >> (lambda_q_shift + 1);

    /* -------- cost parent -------- */
    cost_parent = hadamard_sad8x8_using4x4(a, &early_cbf, i4_frm_qstep);
    for(i = 0; i < 8; i++, pi2_dst += dst_strd)
        vst1q_s16(pi2_dst, a[i]);

    if(i4_cur_depth < i4_max_depth)
    {
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
        {
            *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
            best_cost = cost_child;
            best_cost <<= 1;
            best_cost++;
            pi4_8x8_tu_split[0] = 1;
            pi4_8x8_hsad[0] = cost_child;
        }
        else
        {
            best_cost = cost_parent;
            best_cost <<= 1;
            pi4_8x8_tu_split[0] = 0;
            pi4_8x8_hsad[0] = cost_parent;
        }
    }
    else
    {
        best_cost = cost_parent;
        best_cost <<= 1;
        pi4_8x8_tu_split[0] = 0;
        pi4_8x8_hsad[0] = cost_parent;
    }

    pi4_8x8_tu_early_cbf[0] = early_cbf;

    /* the returned value has the tu_split flag at bit 1 and the early-CBF
       flag at bit 0 (LSB) */
    return ((best_cost << 1) + early_cbf);
}

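/* Combines the four 8x8 Hadamard results laid out in a 16x16 WORD16 array
 * into the 16x16 transform, storing the combined coefficients to pi2_dst
 * and running the early-CBF threshold check on each output register.
 * Returns the 16x16 SATD rounded as (satd + 4) >> 3. */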
static WORD32 ihevce_compute_16x16HAD_using_8x8_neon(
    WORD16 *pi2_8x8_had,
    WORD32 had8_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 i4_frm_qstep,
    WORD32 *pi4_cbf)
{
    int16x8_t b0[8];
    int16x8_t b1[8];
    int16x8_t b2[8];
    int16x8_t b3[8];
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    uint32x4_t sum = vdupq_n_u32(0);
    uint64x2_t c;
    uint64_t satd;
    WORD32 i;

    for(i = 0; i < 8; i++, pi2_8x8_had += had8_strd)
    {
        b0[i] = vld1q_s16(pi2_8x8_had);
        b1[i] = vld1q_s16(pi2_8x8_had + 8);
    }
    for(i = 0; i < 8; i++, pi2_8x8_had += had8_strd)
    {
        b2[i] = vld1q_s16(pi2_8x8_had);
        b3[i] = vld1q_s16(pi2_8x8_had + 8);
    }

#define EARLY_EXIT(k)                                                        \
    {                                                                        \
        p##k = vabsq_s16(q##k);                                              \
        if(*pi4_cbf == 0)                                                    \
        {                                                                    \
            uint16x8_t cmp;                                                  \
            cmp = vcgtq_s16(p##k, threshold);                                \
            if(vget_lane_s64(vreinterpret_s64_u16(vget_low_u16(cmp)), 0) ||  \
               vget_lane_s64(vreinterpret_s64_u16(vget_high_u16(cmp)), 0))   \
            {                                                                \
                *pi4_cbf = 1;                                                \
            }                                                                \
        }                                                                    \
    }
    for(i = 0; i < 8; i++, pi2_dst += dst_strd)
    {
        int16x8_t p0 = vhaddq_s16(b0[i], b1[i]);
        int16x8_t p1 = vhsubq_s16(b0[i], b1[i]);
        int16x8_t p2 = vhaddq_s16(b2[i], b3[i]);
        int16x8_t p3 = vhsubq_s16(b2[i], b3[i]);

        int16x8_t q0 = vaddq_s16(p0, p2);
        int16x8_t q1 = vsubq_s16(p0, p2);
        int16x8_t q2 = vaddq_s16(p1, p3);
        int16x8_t q3 = vsubq_s16(p1, p3);

        vst1q_s16(pi2_dst, q0);
        EARLY_EXIT(0);
        vst1q_s16(pi2_dst + 8, q1);
        EARLY_EXIT(1);
        vst1q_s16(pi2_dst + 8 * dst_strd, q2);
        EARLY_EXIT(2);
        vst1q_s16(pi2_dst + 8 * dst_strd + 8, q3);
        EARLY_EXIT(3);
        uint16x8_t r0 = vaddq_u16(vreinterpretq_u16_s16(p0), vreinterpretq_u16_s16(p1));
        uint16x8_t r1 = vaddq_u16(vreinterpretq_u16_s16(p2), vreinterpretq_u16_s16(p3));

        uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
        uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

        sum = vaddq_u32(sum, s0);
        sum = vaddq_u32(sum, s1);
    }

    c = vpaddlq_u32(sum);
    satd = vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);

    return ((satd + 4) >> 3);
}

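/* Recursive 16x16 HAD: evaluates each 8x8 quadrant through
 * ihevce_had_8x8_using_4_4x4_r_neon (which itself decides between 4x4 and
 * 8x8), accumulates the child cost plus TU-split/CBF signalling bits, and
 * compares it against the 16x16 parent cost computed from the stored 8x8
 * coefficients. Cost, split flags and early-CBF flags are recorded and
 * packed into the return value. */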
WORD32 ihevce_had_16x16_r_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 **ppi4_hsad,
    WORD32 **ppi4_tu_split,
    WORD32 **ppi4_tu_early_cbf,
    WORD32 pos_x_y_4x4,
    WORD32 num_4x4_in_row,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    WORD32 i4_cur_depth,
    WORD32 i4_max_depth,
    WORD32 i4_max_tr_size,
    WORD32 *pi4_tu_split_cost,
    void *pv_func_sel)
{
    WORD16 ai2_8x8_had[256];

    WORD32 *pi4_16x16_hsad;
    WORD32 *pi4_16x16_tu_split;
    WORD32 *pi4_16x16_tu_early_cbf;

    WORD32 best_cost, best_cost_tu_split;
    WORD32 tu_split_flag = 0;
    WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
    WORD32 cost_parent, cost_child = 0;

    const UWORD8 u1_cur_tr_size = 16;

    WORD32 i;

    WORD16 *pi2_y0;
    UWORD8 *src, *pred;
    WORD32 pos_x_y_4x4_0;

    WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
    WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;

    assert(pos_x >= 0);
    assert(pos_y >= 0);

    /* Initialize pointers to store 16x16 SATDs */
    pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

    pi4_16x16_tu_split =
        ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

    pi4_16x16_tu_early_cbf =
        ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);

    /* -------- Compute four 8x8 HAD Transforms of the 16x16 block -------- */
    for(i = 0; i < 4; i++)
    {
        src = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
        pred = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
        pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
        pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);

        best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r_neon(
            src,
            src_strd,
            pred,
            pred_strd,
            pi2_y0,
            16,
            ppi4_hsad,
            ppi4_tu_split,
            ppi4_tu_early_cbf,
            pos_x_y_4x4_0,
            num_4x4_in_row,
            lambda,
            lambda_q_shift,
            i4_frm_qstep,
            i4_cur_depth + 1,
            i4_max_depth,
            i4_max_tr_size,
            pi4_tu_split_cost,
            pv_func_sel);

        /* Cost is shifted by two bits for the tu_split and early-CBF flags */
        best_cost = (best_cost_tu_split >> 2);

        /* Bit 1 stores the TU_split information */
        tu_split_flag += (best_cost_tu_split & 0x3) >> 1;

        /* Bit 0 (LSB) stores the early-CBF information */
        i4_early_cbf_flag += (best_cost_tu_split & 0x1);

        cost_child += best_cost;

        tu_split_flag <<= 1;
        i4_early_cbf_flag <<= 1;
    }

    /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
    pi2_y0 = ai2_8x8_had;

    /* The early-CBF threshold is derived from i4_frm_qstep inside the call */
    cost_parent = ihevce_compute_16x16HAD_using_8x8_neon(
        pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);

    /* 4 TU_split flags, 4 CBF flags; the extra +1 in the shift is because
       0.5 bits per bin is assumed */
    cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);

    i4_early_cbf_flag += early_cbf;

    /* The recursion depth is bounded by i4_max_depth, which is set from the
       config file and decides the extent to which TU_REC is done */
    if(i4_cur_depth < i4_max_depth)
    {
        if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
        {
            *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
            tu_split_flag += 1;
            best_cost = cost_child;
        }
        else
        {
            tu_split_flag += 0;
            best_cost = cost_parent;
        }
    }
    else
    {
        tu_split_flag += 0;
        best_cost = cost_parent;
    }

    pi4_16x16_hsad[0] = best_cost;
    pi4_16x16_tu_split[0] = tu_split_flag;
    pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;

    /* return three values (best cost, tu_split flags, early-CBF flags)
       packed into a single word */
    return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
}

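/* Combines four 16x16 Hadamard coefficient blocks into the 32x32 transform.
 * The inputs are pre-scaled by >> 2 on load to stay within 16 bit, and the
 * function returns the raw absolute-coefficient sum without a final
 * rounding shift, leaving the normalization to the caller. The early-CBF
 * check reuses the EARLY_EXIT macro defined above, which is left defined
 * after the 16x16 function. */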
UWORD32 ihevce_compute_32x32HAD_using_16x16_neon(
    WORD16 *pi2_16x16_had,
    WORD32 had16_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd,
    WORD32 i4_frm_qstep,
    WORD32 *pi4_cbf)
{
    int16x8_t a[4][4][8];
    uint32x4_t sum = vdupq_n_u32(0);
    const int16x8_t threshold = vdupq_n_s16((int16_t)(i4_frm_qstep >> 8));
    WORD32 b8, b16;
    uint64x2_t c;
    WORD32 i, j;

    (void)pi2_dst;
    (void)dst_strd;

    for(b16 = 0; b16 < 4; b16++)
    {
        WORD16 *pi2_b16 = pi2_16x16_had + (b16 >> 1) * (had16_strd * 16) + ((b16 & 1) * 16);

        for(b8 = 0; b8 < 4; b8++)
        {
            WORD16 *pi2_b8 = pi2_b16 + (b8 >> 1) * (had16_strd * 8) + ((b8 & 1) * 8);

            for(i = 0; i < 8; i++, pi2_b8 += had16_strd)
            {
                a[b16][b8][i] = vld1q_s16(pi2_b8);
                a[b16][b8][i] = vshrq_n_s16(a[b16][b8][i], 2);
            }
        }
    }

    for(j = 0; j < 4; j++)
    {
        for(i = 0; i < 8; i++)
        {
            int16x8_t p0 = vaddq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p1 = vsubq_s16(a[0][j][i], a[1][j][i]);
            int16x8_t p2 = vaddq_s16(a[2][j][i], a[3][j][i]);
            int16x8_t p3 = vsubq_s16(a[2][j][i], a[3][j][i]);

            int16x8_t q0 = vaddq_s16(p0, p2);
            int16x8_t q1 = vsubq_s16(p0, p2);
            int16x8_t q2 = vaddq_s16(p1, p3);
            int16x8_t q3 = vsubq_s16(p1, p3);

            EARLY_EXIT(0);
            EARLY_EXIT(1);
            EARLY_EXIT(2);
            EARLY_EXIT(3);

            uint16x8_t r0 = vaddq_u16(vreinterpretq_u16_s16(p0), vreinterpretq_u16_s16(p1));
            uint16x8_t r1 = vaddq_u16(vreinterpretq_u16_s16(p2), vreinterpretq_u16_s16(p3));

            uint32x4_t s0 = vaddl_u16(vget_low_u16(r0), vget_high_u16(r0));
            uint32x4_t s1 = vaddl_u16(vget_low_u16(r1), vget_high_u16(r1));

            sum = vaddq_u32(sum, s0);
            sum = vaddq_u32(sum, s1);
        }
    }
    c = vpaddlq_u32(sum);

    return vget_lane_u64(vadd_u64(vget_low_u64(c), vget_high_u64(c)), 0);
}