1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 ******************************************************************************
22 * @file
23 * ihevce_me_neon.c
24 *
* @brief
*  NEON SAD kernels and best-result update routines for ME algo
27 *
28 * @author
29 * Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 * None
35 *
36 ********************************************************************************
37 */
38
39 /*****************************************************************************/
40 /* File Includes */
41 /*****************************************************************************/
42 /* System include files */
43 #include <stdio.h>
44 #include <string.h>
45 #include <assert.h>
46 #include <arm_neon.h>
47
48 /* User include files */
49 #include "ihevc_typedefs.h"
50 #include "itt_video_api.h"
51 #include "ihevc_cmn_utils_neon.h"
52 #include "ihevc_chroma_itrans_recon.h"
53 #include "ihevc_chroma_intra_pred.h"
54 #include "ihevc_debug.h"
55 #include "ihevc_deblk.h"
56 #include "ihevc_defs.h"
57 #include "ihevc_itrans_recon.h"
58 #include "ihevc_intra_pred.h"
59 #include "ihevc_inter_pred.h"
60 #include "ihevc_macros.h"
61 #include "ihevc_mem_fns.h"
62 #include "ihevc_padding.h"
63 #include "ihevc_quant_iquant_ssd.h"
64 #include "ihevc_resi_trans.h"
65 #include "ihevc_sao.h"
66 #include "ihevc_structs.h"
67 #include "ihevc_weighted_pred.h"
68
69 #include "rc_cntrl_param.h"
70 #include "rc_frame_info_collector.h"
71 #include "rc_look_ahead_params.h"
72
73 #include "ihevce_api.h"
74 #include "ihevce_defs.h"
75 #include "ihevce_lap_enc_structs.h"
76 #include "ihevce_multi_thrd_structs.h"
77 #include "ihevce_function_selector.h"
78 #include "ihevce_me_common_defs.h"
79 #include "ihevce_enc_structs.h"
80 #include "ihevce_had_satd.h"
81 #include "ihevce_ipe_instr_set_router.h"
82 #include "ihevce_global_tables.h"
83
84 #include "hme_datatype.h"
85 #include "hme_common_defs.h"
86 #include "hme_common_utils.h"
87 #include "hme_interface.h"
88 #include "hme_defs.h"
89 #include "hme_err_compute.h"
90 #include "hme_globals.h"
91
92 #include "ihevce_me_instr_set_router.h"
93
94 /*****************************************************************************/
95 /* Typedefs */
96 /*****************************************************************************/
97 typedef void ft_calc_sad4_nxn(
98 UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad);
99
100 /*****************************************************************************/
101 /* Function Macros */
102 /*****************************************************************************/
/*
 * COMBINE_SADS(pps, as, i)
 *
 * Derives the SAD of every partition shape of a block from the SADs of its
 * 16 sub-blocks and stores each result at pps[PART_ID_xxx][i].
 *
 *   pps : two-level indexable destination, first index is the partition id,
 *         second index is i
 *   as  : 16 sub-block SADs laid out as a 4x4 grid in raster order
 *         (as[0..3] is the top row of sub-blocks, as[12..15] the bottom row)
 *   i   : second-level index written in every pps row
 *
 * The four quadrant (NxN) sums are formed first; the two-way splits are sums
 * of quadrants; the asymmetric (AMP) quarter strips are summed directly from
 * one row/column of sub-blocks, and their complements are obtained by
 * subtracting from the 2Nx2N total.
 *
 * The body is wrapped in do { } while(0) so the macro behaves as a single
 * statement (safe in an un-braced if/else). Every argument is expanded
 * several times, so pass only side-effect free expressions.
 */
#define COMBINE_SADS(pps, as, i)                                                                \
    do                                                                                          \
    {                                                                                           \
        (pps)[PART_ID_NxN_TL][(i)] = ((as)[0] + (as)[1] + (as)[4] + (as)[5]);                   \
        (pps)[PART_ID_NxN_TR][(i)] = ((as)[2] + (as)[3] + (as)[6] + (as)[7]);                   \
        (pps)[PART_ID_NxN_BL][(i)] = ((as)[8] + (as)[9] + (as)[12] + (as)[13]);                 \
        (pps)[PART_ID_NxN_BR][(i)] = ((as)[10] + (as)[11] + (as)[14] + (as)[15]);               \
                                                                                                \
        (pps)[PART_ID_Nx2N_L][(i)] = (pps)[PART_ID_NxN_TL][(i)] + (pps)[PART_ID_NxN_BL][(i)];   \
        (pps)[PART_ID_Nx2N_R][(i)] = (pps)[PART_ID_NxN_TR][(i)] + (pps)[PART_ID_NxN_BR][(i)];   \
        (pps)[PART_ID_2NxN_T][(i)] = (pps)[PART_ID_NxN_TR][(i)] + (pps)[PART_ID_NxN_TL][(i)];   \
        (pps)[PART_ID_2NxN_B][(i)] = (pps)[PART_ID_NxN_BR][(i)] + (pps)[PART_ID_NxN_BL][(i)];   \
                                                                                                \
        (pps)[PART_ID_nLx2N_L][(i)] = ((as)[8] + (as)[0] + (as)[12] + (as)[4]);                 \
        (pps)[PART_ID_nRx2N_R][(i)] = ((as)[3] + (as)[7] + (as)[15] + (as)[11]);                \
        (pps)[PART_ID_2NxnU_T][(i)] = ((as)[1] + (as)[0] + (as)[2] + (as)[3]);                  \
        (pps)[PART_ID_2NxnD_B][(i)] = ((as)[15] + (as)[14] + (as)[12] + (as)[13]);              \
                                                                                                \
        (pps)[PART_ID_2Nx2N][(i)] = (pps)[PART_ID_2NxN_T][(i)] + (pps)[PART_ID_2NxN_B][(i)];    \
                                                                                                \
        (pps)[PART_ID_2NxnU_B][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_2NxnU_T][(i)];  \
        (pps)[PART_ID_2NxnD_T][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_2NxnD_B][(i)];  \
        (pps)[PART_ID_nRx2N_L][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_nRx2N_R][(i)];  \
        (pps)[PART_ID_nLx2N_R][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_nLx2N_L][(i)];  \
    } while(0)
127
/*
 * COMBINE_SADS_2(ps, as)
 *
 * Single-candidate form of the partition SAD combiner: derives the SAD of
 * every partition shape from 16 sub-block SADs and stores each result at
 * ps[PART_ID_xxx].
 *
 *   ps : destination array indexed by partition id
 *   as : 16 sub-block SADs laid out as a 4x4 grid in raster order
 *        (as[0..3] is the top row of sub-blocks, as[12..15] the bottom row)
 *
 * Quadrant (NxN) sums are formed first, two-way splits are sums of
 * quadrants, the asymmetric quarter strips are summed directly and their
 * complements are obtained by subtracting from the 2Nx2N total.
 *
 * The body is wrapped in do { } while(0) so the macro behaves as a single
 * statement (safe in an un-braced if/else). Both arguments are expanded
 * several times, so pass only side-effect free expressions.
 */
#define COMBINE_SADS_2(ps, as)                                                   \
    do                                                                           \
    {                                                                            \
        (ps)[PART_ID_NxN_TL] = ((as)[0] + (as)[1] + (as)[4] + (as)[5]);          \
        (ps)[PART_ID_NxN_TR] = ((as)[2] + (as)[3] + (as)[6] + (as)[7]);          \
        (ps)[PART_ID_NxN_BL] = ((as)[8] + (as)[9] + (as)[12] + (as)[13]);        \
        (ps)[PART_ID_NxN_BR] = ((as)[10] + (as)[11] + (as)[14] + (as)[15]);      \
                                                                                 \
        (ps)[PART_ID_Nx2N_L] = (ps)[PART_ID_NxN_TL] + (ps)[PART_ID_NxN_BL];      \
        (ps)[PART_ID_Nx2N_R] = (ps)[PART_ID_NxN_TR] + (ps)[PART_ID_NxN_BR];      \
        (ps)[PART_ID_2NxN_T] = (ps)[PART_ID_NxN_TR] + (ps)[PART_ID_NxN_TL];      \
        (ps)[PART_ID_2NxN_B] = (ps)[PART_ID_NxN_BR] + (ps)[PART_ID_NxN_BL];      \
                                                                                 \
        (ps)[PART_ID_nLx2N_L] = ((as)[8] + (as)[0] + (as)[12] + (as)[4]);        \
        (ps)[PART_ID_nRx2N_R] = ((as)[3] + (as)[7] + (as)[15] + (as)[11]);       \
        (ps)[PART_ID_2NxnU_T] = ((as)[1] + (as)[0] + (as)[2] + (as)[3]);         \
        (ps)[PART_ID_2NxnD_B] = ((as)[15] + (as)[14] + (as)[12] + (as)[13]);     \
                                                                                 \
        (ps)[PART_ID_2Nx2N] = (ps)[PART_ID_2NxN_T] + (ps)[PART_ID_2NxN_B];       \
                                                                                 \
        (ps)[PART_ID_2NxnU_B] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_2NxnU_T];     \
        (ps)[PART_ID_2NxnD_T] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_2NxnD_B];     \
        (ps)[PART_ID_nRx2N_L] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_nRx2N_R];     \
        (ps)[PART_ID_nLx2N_R] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_nLx2N_L];     \
    } while(0)
152
153 /*****************************************************************************/
154 /* Function Definitions */
155 /*****************************************************************************/
156
/**
 * @brief Computes the SADs of four horizontally adjacent 2x2 blocks
 *        (one 8x2 strip) in a single call.
 *
 * @param pu1_src    top-left sample of the source strip; 8 bytes are loaded
 *                   from each of 2 rows
 * @param src_strd   distance in bytes between consecutive source rows
 * @param pu1_pred   top-left sample of the prediction strip; 8 bytes are
 *                   loaded from each of 2 rows
 * @param pred_strd  distance in bytes between consecutive prediction rows
 * @param pu4_sad    receives 4 SADs, leftmost block first
 */
static void ihevce_sad4_2x2_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    /* Lane k holds |src - pred| of column k accumulated over the rows */
    uint16x8_t col_sums = vdupq_n_u16(0);
    WORD32 row;

    for(row = 0; row < 2; row++)
    {
        const uint8x8_t src_row = vld1_u8(pu1_src + row * src_strd);
        const uint8x8_t pred_row = vld1_u8(pu1_pred + row * pred_strd);

        col_sums = vabal_u8(col_sums, src_row, pred_row);
    }

    /* Adding each pair of neighbouring columns (with widening to 32 bit) */
    /* leaves exactly one total per 2x2 block                             */
    vst1q_u32(pu4_sad, vpaddlq_u16(col_sums));
}
177
/**
 * @brief Computes the SADs of four horizontally adjacent 4x4 blocks
 *        (one 16x4 strip) in a single call.
 *
 * @param pu1_src    top-left sample of the source strip; 16 bytes are loaded
 *                   from each of 4 rows
 * @param src_strd   distance in bytes between consecutive source rows
 * @param pu1_pred   top-left sample of the prediction strip; 16 bytes are
 *                   loaded from each of 4 rows
 * @param pred_strd  distance in bytes between consecutive prediction rows
 * @param pu2_sad    receives 4 SADs as 16-bit values, leftmost block first
 */
static void ihevce_sad4_4x4_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD16 *pu2_sad)
{
    /* Per-column accumulators: columns 0..7 (blocks 0 and 1) and */
    /* columns 8..15 (blocks 2 and 3)                              */
    uint16x8_t acc_left = vdupq_n_u16(0);
    uint16x8_t acc_right = vdupq_n_u16(0);
    uint16x4_t pairs_left, pairs_right;
    WORD32 row;

    for(row = 0; row < 4; row++)
    {
        const uint8x16_t src_row = vld1q_u8(pu1_src + row * src_strd);
        const uint8x16_t pred_row = vld1q_u8(pu1_pred + row * pred_strd);

        acc_left = vabal_u8(acc_left, vget_low_u8(src_row), vget_low_u8(pred_row));
        acc_right = vabal_u8(acc_right, vget_high_u8(src_row), vget_high_u8(pred_row));
    }

    /* First pairwise add: two partial sums per block */
    pairs_left = vpadd_u16(vget_low_u16(acc_left), vget_high_u16(acc_left));
    pairs_right = vpadd_u16(vget_low_u16(acc_right), vget_high_u16(acc_right));

    /* Second pairwise add: one SAD per block, in block order 0, 1, 2, 3 */
    vst1_u16(pu2_sad, vpadd_u16(pairs_left, pairs_right));
}
203
/**
 * @brief Computes the SADs of four horizontally adjacent 8x8 blocks
 *        (one 32x8 strip) in a single call.
 *
 * @param pu1_src    top-left sample of the source strip; 32 bytes are loaded
 *                   from each of 8 rows
 * @param src_strd   distance in bytes between consecutive source rows
 * @param pu1_pred   top-left sample of the prediction strip; 32 bytes are
 *                   loaded from each of 8 rows
 * @param pred_strd  distance in bytes between consecutive prediction rows
 * @param pu4_sad    receives 4 SADs, leftmost block first
 */
static void ihevce_sad4_8x8_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    /* One 8-lane column accumulator per 8x8 block */
    uint16x8_t acc_blk0 = vdupq_n_u16(0);
    uint16x8_t acc_blk1 = vdupq_n_u16(0);
    uint16x8_t acc_blk2 = vdupq_n_u16(0);
    uint16x8_t acc_blk3 = vdupq_n_u16(0);
    uint16x4_t quad0, quad1, quad2, quad3;
    uint16x4_t pair01, pair23;
    WORD32 row;

    for(row = 0; row < 8; row++)
    {
        const UWORD8 *pu1_src_row = pu1_src + row * src_strd;
        const UWORD8 *pu1_pred_row = pu1_pred + row * pred_strd;
        const uint8x16_t src_lo = vld1q_u8(pu1_src_row);
        const uint8x16_t pred_lo = vld1q_u8(pu1_pred_row);
        const uint8x16_t src_hi = vld1q_u8(pu1_src_row + 16);
        const uint8x16_t pred_hi = vld1q_u8(pu1_pred_row + 16);

        acc_blk0 = vabal_u8(acc_blk0, vget_low_u8(src_lo), vget_low_u8(pred_lo));
        acc_blk1 = vabal_u8(acc_blk1, vget_high_u8(src_lo), vget_high_u8(pred_lo));
        acc_blk2 = vabal_u8(acc_blk2, vget_low_u8(src_hi), vget_low_u8(pred_hi));
        acc_blk3 = vabal_u8(acc_blk3, vget_high_u8(src_hi), vget_high_u8(pred_hi));
    }

    /* 8 column sums -> 4 partial sums, per block */
    quad0 = vpadd_u16(vget_low_u16(acc_blk0), vget_high_u16(acc_blk0));
    quad1 = vpadd_u16(vget_low_u16(acc_blk1), vget_high_u16(acc_blk1));
    quad2 = vpadd_u16(vget_low_u16(acc_blk2), vget_high_u16(acc_blk2));
    quad3 = vpadd_u16(vget_low_u16(acc_blk3), vget_high_u16(acc_blk3));

    /* 4 partial sums -> 2 partial sums, two blocks per vector */
    pair01 = vpadd_u16(quad0, quad1);
    pair23 = vpadd_u16(quad2, quad3);

    /* Final widening pairwise add leaves one 32-bit SAD per block */
    vst1q_u32(pu4_sad, vpaddlq_u16(vcombine_u16(pair01, pair23)));
}
242
/**
 * @brief Computes the SADs of four horizontally adjacent 16x16 blocks
 *        (one 64x16 strip) by calling ihevce_nxn_sad_computer_neon() once
 *        per block.
 *
 * @param pu1_src    top-left sample of the source strip
 * @param src_strd   source stride passed through to the SAD routine
 * @param pu1_pred   top-left sample of the prediction strip
 * @param pred_strd  prediction stride passed through to the SAD routine
 * @param pu4_sad    receives 4 SADs, leftmost block first
 */
static void ihevce_sad4_16x16_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    WORD32 x_off;

    /* Step across the strip one 16-sample wide block at a time */
    for(x_off = 0; x_off < 64; x_off += 16)
    {
        *pu4_sad++ =
            ihevce_nxn_sad_computer_neon(pu1_src + x_off, src_strd, pu1_pred + x_off, pred_strd, 16);
    }
}
255
/**
 * @brief Expands the search grids into a candidate list and computes, for
 *        every candidate, the SAD of each partition of the current block.
 *
 * For every grid in ps_grid, each of the (up to) NUM_CANDIDATES_IN_GRID grid
 * points whose bit is set in the grid mask becomes one candidate: its
 * reference pointer and MV are the grid centre's displaced by the grid step.
 * Each candidate is then evaluated as 4 strips of 4 sub-block SADs and the
 * 16 values are combined into partition SADs with COMBINE_SADS.
 *
 * @param ps_grid         grid description (centre pointers, MVs, masks, step)
 * @param pu1_cur_ptr     top-left sample of the current (source) block
 * @param cur_buf_stride  stride of the current buffer
 * @param pp_part_sads    output; pp_part_sads[part_id][c] is written for
 *                        every candidate index c
 * @param ps_cand         output candidate array, filled in grid/point order
 * @param num_cands       output; number of candidates written to ps_cand
 * @param e_cu_size       block size; only CU_8x8, CU_32x32 and CU_64x64 have
 *                        a SAD kernel here (16x16 blocks are served by
 *                        compute_4x4_sads_for_16x16_blk_neon)
 */
void compute_part_sads_for_MxM_blk_neon(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    WORD32 **pp_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands,
    CU_SIZE_T e_cu_size)
{
    /* grd_sz_y_x carries the vertical step in its upper 16 bits and the */
    /* horizontal step in its lower 16 bits                              */
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);

    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
    WORD32 offset_x[NUM_CANDIDATES_IN_GRID] = { 0, -grd_sz_x, 0, grd_sz_x, 0,
                                                -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
    WORD32 offset_y[NUM_CANDIDATES_IN_GRID] = { 0, 0, -grd_sz_y, 0, grd_sz_y,
                                                -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };

    /* The enum value is used directly as a shift: one strip of sub-blocks */
    /* is (2 << shift) rows tall.                                          */
    /* NOTE(review): presumes CU_8x8 == 0 with consecutive sizes - confirm */
    WORD32 shift = (WORD32)e_cu_size;

    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
    WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
    WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));

    cand_t *cand0 = ps_cand;

    ft_calc_sad4_nxn *calc_sad4 = NULL;

    /* for a 2Nx2N partition we evaluate (N/2)x(N/2) SADs. This is needed for
     * AMP cases */
    UWORD32 au4_nxn_sad[16];

    WORD32 i, j;

    *num_cands = 0;

    /* Loop to fill up the cand_t array and to calculate num_cands */
    for(i = 0; i < ps_grid->num_grids; i++)
    {
        WORD32 mask = ps_grid->pi4_grd_mask[i];
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);

        /* Bit j of the mask enables grid point j */
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
        {
            if(mask & 1)
            {
                *num_cands = *num_cands + 1;
                cand0->grid_ix = i;
                cand0->ref_idx = ps_grid->p_ref_idx[i];
                cand0->pu1_ref_ptr =
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
                cand0++;
            }
        }
    }

    /* fn selector */
    switch(e_cu_size)
    {
    case CU_8x8:
        calc_sad4 = ihevce_sad4_2x2_neon;
        break;
    case CU_32x32:
        calc_sad4 = ihevce_sad4_8x8_neon;
        break;
    case CU_64x64:
        calc_sad4 = ihevce_sad4_16x16_neon;
        break;
    default:
        /* No kernel for this size: calc_sad4 stays NULL */
        break;
    }

    /* Calling through a NULL kernel pointer would be undefined behaviour; */
    /* catch an unsupported e_cu_size as soon as there is work to do       */
    assert((0 == *num_cands) || (NULL != calc_sad4));

    /* Loop to compute the SAD's */
    for(i = 0; i < *num_cands; i++)
    {
        cand_t *cand = ps_cand + i;

        /* Strip j fills au4_nxn_sad[4 * j .. 4 * j + 3] */
        for(j = 0; j < 4; j++)
        {
            (*calc_sad4)(
                pu1_cur_ptr + j * cur_buf_stride_lsN,
                cur_buf_stride,
                cand->pu1_ref_ptr + j * ref_buf_stride_lsN,
                ref_buf_stride,
                &au4_nxn_sad[4 * j]);
        }

        COMBINE_SADS(pp_part_sads, au4_nxn_sad, i);
    }
}
340
/**
 * @brief Expands the search grids into a candidate list and computes, for
 *        every candidate, the partition SADs of a 16x16 block from its
 *        sixteen 4x4 SADs.
 *
 * Each grid point whose bit is set in the grid's mask becomes a candidate
 * whose reference pointer and MV are the grid centre's displaced by the grid
 * step. Every candidate is evaluated with 4 calls to ihevce_sad4_4x4_neon()
 * (one per strip of four 4x4 blocks) and combined with COMBINE_SADS.
 *
 * @param ps_grid         grid description (centre pointers, MVs, masks, step)
 * @param pu1_cur_ptr     top-left sample of the current (source) block
 * @param cur_buf_stride  stride of the current buffer
 * @param pp_part_sads    output; pp_part_sads[part_id][c] is written for
 *                        every candidate index c
 * @param ps_cand         output candidate array, filled in grid/point order
 * @param num_cands       output; number of candidates written to ps_cand
 */
void compute_4x4_sads_for_16x16_blk_neon(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    UWORD16 **pp_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands)
{
    /* Upper 16 bits: vertical grid step, lower 16 bits: horizontal step */
    WORD16 step_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 step_x = (ps_grid->grd_sz_y_x & 0xFFFF);

    /* Displacement of each grid point, in the order */
    /* C, L, T, R, B, TL, TR, BL, BR                 */
    WORD32 ai4_dx[NUM_CANDIDATES_IN_GRID] = { 0,       -step_x, 0,       step_x, 0,
                                              -step_x, step_x,  -step_x, step_x };
    WORD32 ai4_dy[NUM_CANDIDATES_IN_GRID] = { 0,       0,       -step_y, 0,     step_y,
                                              -step_y, -step_y, step_y,  step_y };

    WORD32 ref_strd = ps_grid->ref_buf_stride;

    /* Distance between the tops of two vertically adjacent 4-row strips */
    WORD32 cur_strip_strd = (cur_buf_stride << 2);
    WORD32 ref_strip_strd = (ref_strd << 2);

    /* Sixteen 4x4 SADs of one candidate, raster order */
    UWORD16 au2_blk_sad[16];

    cand_t *ps_next_cand = ps_cand;
    WORD32 grid_ix, pt, cand_ix, strip;

    *num_cands = 0;

    /* Pass 1: turn every enabled grid point into a candidate */
    for(grid_ix = 0; grid_ix < ps_grid->num_grids; grid_ix++)
    {
        WORD32 pt_mask = ps_grid->pi4_grd_mask[grid_ix];
        UWORD8 *pu1_center = ps_grid->ppu1_ref_ptr[grid_ix];
        WORD32 center_mv_x = ps_grid->p_mv[grid_ix].i2_mv_x;
        WORD32 center_mv_y = (ps_grid->p_mv[grid_ix].i2_mv_y);

        for(pt = 0; pt < NUM_CANDIDATES_IN_GRID; pt++, pt_mask >>= 1)
        {
            if(!(pt_mask & 1))
            {
                continue;
            }

            *num_cands = *num_cands + 1;
            ps_next_cand->grid_ix = grid_ix;
            ps_next_cand->ref_idx = ps_grid->p_ref_idx[grid_ix];
            ps_next_cand->pu1_ref_ptr = pu1_center + ai4_dx[pt] + ref_strd * ai4_dy[pt];
            ps_next_cand->mv.i2_mv_x = (S16)(center_mv_x) + ai4_dx[pt];
            ps_next_cand->mv.i2_mv_y = (S16)(center_mv_y) + ai4_dy[pt];
            ps_next_cand++;
        }
    }

    /* Pass 2: 4x4 SADs and partition SADs of every candidate */
    for(cand_ix = 0; cand_ix < *num_cands; cand_ix++)
    {
        for(strip = 0; strip < 4; strip++)
        {
            ihevce_sad4_4x4_neon(
                pu1_cur_ptr + strip * cur_strip_strd,
                cur_buf_stride,
                ps_cand[cand_ix].pu1_ref_ptr + strip * ref_strip_strd,
                ref_strd,
                &au2_blk_sad[4 * strip]);
        }

        COMBINE_SADS(pp_part_sads, au2_blk_sad, cand_ix);
    }
}
413
/**
 * @brief Evaluates the SAD of one block at every enabled point of a 9-point
 *        grid around ps_prms->pu1_ref.
 *
 * Exactly one partition may be selected (i4_part_mask must have at most one
 * bit set, checked by assert). Results are written contiguously, one per
 * enabled grid point in ascending grid id order, starting at
 * pi4_sad_grid + pi4_valid_part_ids[0] * (number of enabled grid points).
 *
 * @param ps_prms  error computation parameters: input/reference pointers and
 *                 strides, block width/height, grid step, grid mask and the
 *                 output SAD buffer
 */
void hme_evalsad_grid_npu_MxN_neon(err_prms_t *ps_prms)
{
    S32 *pi4_out = ps_prms->pi4_sad_grid;
    S32 x_step = ps_prms->i4_step;
    S32 y_step = ps_prms->i4_step * ps_prms->i4_ref_stride;
    S32 num_active = 0;
    S32 pt;

    assert((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    /* Count the enabled grid points: this is the per-partition row length */
    for(pt = 0; pt < 9; pt++)
    {
        num_active += (ps_prms->i4_grid_mask & (1 << pt)) ? 1 : 0;
    }

    /* Jump to the row that belongs to the (single) valid partition */
    pi4_out += (ps_prms->pi4_valid_part_ids[0] * num_active);

    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            /* Displace the reference by the grid point's x/y direction */
            U08 *pu1_ref_pt =
                ps_prms->pu1_ref + x_step * gai1_grid_id_to_x[pt] + y_step * gai1_grid_id_to_y[pt];

            *pi4_out++ = ihevce_4mx4n_sad_computer_neon(
                ps_prms->pu1_inp,
                pu1_ref_pt,
                ps_prms->i4_inp_stride,
                ps_prms->i4_ref_stride,
                ps_prms->i4_blk_wd,
                ps_prms->i4_blk_ht);
        }
    }
}
449
/**
 * @brief Evaluates the SAD of one block at a single point and stores it in
 *        the first entry of ps_prms->pi4_sad_grid.
 *
 * @param ps_prms  error computation parameters: input/reference pointers and
 *                 strides, block width/height and the output SAD buffer
 */
void hme_evalsad_pt_npu_MxN_8bit_neon(err_prms_t *ps_prms)
{
    S32 *pi4_out = ps_prms->pi4_sad_grid;

    *pi4_out = ihevce_4mx4n_sad_computer_neon(
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);
}
460
/**
 * @brief Evaluates every full-pel search node and keeps, per valid
 *        partition, the single best (lowest total cost) result.
 *
 * For each search node that is not marked INTRA_MV the function
 *  - points ps_err_prms->pu1_inp / pu1_ref at the weighted input and at the
 *    reference displaced by the node's MV,
 *  - computes sixteen 4x4 SADs (4 strips of 4 blocks) and combines them into
 *    partition SADs in ps_err_prms->pi4_sad_grid,
 *  - computes the MV cost against the 2Nx2N MV predictor taken from
 *    ps_result_prms,
 *  - updates entry [0] of the full-pel refine context for every valid
 *    partition whose total cost improves.
 *
 * @param ps_search_prms  search nodes, block offsets and the full-pel refine
 *                        context that receives the best results
 * @param ps_wt_inp_prms  weighted input planes, indexed by reference index
 * @param ps_err_prms     strides and SAD grid; pu1_inp and pu1_ref are
 *                        overwritten per node
 * @param ps_result_prms  supplies the reference index and search results
 *                        used to look up the MV predictor and lambda
 * @param ppu1_ref        reference planes, indexed by reference index
 * @param i4_ref_stride   stride used to locate the reference block (the SAD
 *                        kernels themselves use ps_err_prms->i4_ref_stride)
 */
void hme_calc_sad_and_1_best_result_neon(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
{
    mv_refine_ctxt_t *refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
    search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
    S32 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
    S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
    S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
    S32 ref_buf_stride = ps_err_prms->i4_ref_stride;
    S32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
    S32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
    S32 i4_inp_off, i4_ref_off;
    S32 i;

    i4_inp_off = ps_search_prms->i4_cu_x_off;
    i4_inp_off += (ps_search_prms->i4_cu_y_off * cur_buf_stride);
    i4_ref_off = ps_search_prms->i4_x_off;
    i4_ref_off += (ps_search_prms->i4_y_off * i4_ref_stride);

    /* Run through each of the candts in a loop.                           */
    /* The node pointer is advanced in the loop header so that the         */
    /* 'continue' taken for INTRA_MV nodes also moves on to the next node; */
    /* advancing only at the bottom of the body left the pointer stuck on  */
    /* the first INTRA_MV node and skipped every node after it.            */
    /* NOTE(review): if a non-NEON counterpart of this routine exists and  */
    /* must match bit-exactly, apply the same change there - confirm.      */
    for(i = 0; i < i4_num_nodes; i++, ps_search_node++)
    {
        U16 au2_4x4_sad[16];
        S32 i4_mv_cost;
        S32 j;

        if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
        {
            continue;
        }

        ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
        ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_off;
        ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
        ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);

        /* Loop to compute the SAD's: strip j fills au2_4x4_sad[4j .. 4j+3] */
        for(j = 0; j < 4; j++)
        {
            UWORD8 *pu1_curr = ps_err_prms->pu1_inp;
            UWORD8 *pu1_ref = ps_err_prms->pu1_ref;

            ihevce_sad4_4x4_neon(
                pu1_curr + j * cur_buf_stride_ls2,
                cur_buf_stride,
                pu1_ref + j * ref_buf_stride_ls2,
                ref_buf_stride,
                &au2_4x4_sad[4 * j]);
        }

        COMBINE_SADS_2(pi4_sad_grid, au2_4x4_sad);

        // calculate MV cost
        {
            S16 mvdx1, mvdy1;
            S32 i4_ref_idx = ps_result_prms->i1_ref_idx;
            search_results_t *ps_srch_rslts = ps_result_prms->ps_search_results;

            pred_ctxt_t *ps_pred_ctxt = &ps_srch_rslts->as_pred_ctxt[i4_ref_idx];
            pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
            search_node_t *ps_mvp_node = ps_pred_nodes->ps_mvp_node;

            S32 inp_shift = 2;
            S32 pred_shift = ps_mvp_node->u1_subpel_done ? 0 : 2;
            S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
            S32 lambda = ps_pred_ctxt->lambda;
            S32 rnd = 1 << (lambda_q_shift - 1);
            S32 mv_p_x = ps_mvp_node->s_mv.i2_mvx;
            S32 mv_p_y = ps_mvp_node->s_mv.i2_mvy;
            S32 ref_bits =
                ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];

            COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);

            mvdx1 = ABS(mvdx1);
            mvdy1 = ABS(mvdy1);

            i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) +
                         ref_bits + 2;

            /* Scale by lambda with rounding, then saturate to 16 bits */
            i4_mv_cost *= lambda;
            i4_mv_cost += rnd;
            i4_mv_cost >>= lambda_q_shift;

            i4_mv_cost = CLIP_U16(i4_mv_cost);
        }

        {
            S32 i4_sad, i4_tot_cost;
            S32 *pi4_valid_part_ids = &refine_ctxt->ai4_part_id[0];
            S32 best_node_cost;

            /* For each valid partition, update the refine_prm structure to
             * reflect the best candidate for that partition */
            for(j = 0; j < refine_ctxt->i4_num_valid_parts; j++)
            {
                S32 part_id = pi4_valid_part_ids[j];
                /* Results are stored by partition id when more than 8    */
                /* partitions are valid, otherwise compactly by position  */
                S32 id = (refine_ctxt->i4_num_valid_parts > 8) ? part_id : j;

                i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
                i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

                best_node_cost = CLIP_S16(refine_ctxt->i2_tot_cost[0][id]);

                if(i4_tot_cost < best_node_cost)
                {
                    refine_ctxt->i2_tot_cost[0][id] = i4_tot_cost;
                    refine_ctxt->i2_mv_cost[0][id] = i4_mv_cost;
                    refine_ctxt->i2_mv_x[0][id] = ps_search_node->s_mv.i2_mvx;
                    refine_ctxt->i2_mv_y[0][id] = ps_search_node->s_mv.i2_mvy;
                    refine_ctxt->i2_ref_idx[0][id] = ps_search_node->i1_ref_idx;
                }
            }
        }
    }

    /* Entries that never received a result take their reference index */
    /* from the first search node                                       */
    ps_search_node = ps_search_prms->ps_search_nodes;

    /* NOTE(review): this loop indexes the result arrays by part_id, while */
    /* the update loop above uses the compact position when at most 8     */
    /* partitions are valid. Left unchanged here - confirm which indexing */
    /* is intended.                                                       */
    for(i = 0; i < refine_ctxt->i4_num_valid_parts; i++)
    {
        S32 part_id = refine_ctxt->ai4_part_id[i];

        if(refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
        {
            assert(refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
            assert(refine_ctxt->i2_mv_x[0][part_id] == 0);
            assert(refine_ctxt->i2_mv_y[0][part_id] == 0);

            refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
        }
        if(refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
        {
            assert(refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
            assert(refine_ctxt->i2_mv_x[1][part_id] == 0);
            assert(refine_ctxt->i2_mv_y[1][part_id] == 0);

            refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
        }
    }
}
608
/**
 * @brief Evaluates one sub-pel candidate and updates, per valid partition,
 *        the single best result held in the sub-pel refine context.
 *
 * Sixteen 4x4 SADs are computed between ps_err_prms->pu1_inp and
 * ps_err_prms->pu1_ref and combined into partition SADs in
 * ps_err_prms->pi4_sad_grid. For every valid partition the SAD is added to
 * the MV cost already stored in the context; if the total beats the stored
 * total cost, the entry is replaced with the MV and reference index from
 * ps_result_prms. Finally, every partition whose total cost is still at
 * MAX_SIGNED_16BIT_VAL or above gets its full-pel SATD set to that value.
 *
 * @param ps_err_prms     input/reference pointers, strides and SAD grid
 * @param ps_result_prms  candidate MV, reference index and the sub-pel
 *                        refine context to update
 */
void hme_calc_sad_and_1_best_result_subpel_neon(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_part_sad = ps_err_prms->pi4_sad_grid;
    S32 inp_strd = ps_err_prms->i4_inp_stride;
    S32 ref_strd = ps_err_prms->i4_ref_stride;

    /* Distance between the tops of two vertically adjacent 4-row strips */
    S32 inp_strip_strd = (inp_strd << 2);
    S32 ref_strip_strd = (ref_strd << 2);

    /* Sixteen 4x4 SADs of the candidate, raster order */
    U16 au2_blk_sad[16];
    S32 strip, idx, part;

    for(strip = 0; strip < 4; strip++)
    {
        ihevce_sad4_4x4_neon(
            ps_err_prms->pu1_inp + strip * inp_strip_strd,
            inp_strd,
            ps_err_prms->pu1_ref + strip * ref_strip_strd,
            ref_strd,
            &au2_blk_sad[4 * strip]);
    }

    COMBINE_SADS_2(pi4_part_sad, au2_blk_sad);

    /* Keep this candidate for every valid partition it improves */
    for(idx = 0; idx < ps_ctxt->i4_num_valid_parts; idx++)
    {
        S32 part_id = ps_ctxt->ai4_part_id[idx];
        /* Results are stored by partition id when more than 8 partitions */
        /* are valid, otherwise compactly by position                     */
        S32 slot = (ps_ctxt->i4_num_valid_parts > 8) ? part_id : idx;
        S32 mv_cost = ps_ctxt->i2_mv_cost[0][slot];
        S32 sad = CLIP3(pi4_part_sad[part_id], 0, 0x7fff);
        S32 tot_cost = CLIP_S16(sad + mv_cost);
        S32 cost_to_beat = CLIP_S16(ps_ctxt->i2_tot_cost[0][slot]);

        if(tot_cost >= cost_to_beat)
        {
            continue;
        }

        ps_ctxt->i2_tot_cost[0][slot] = tot_cost;
        ps_ctxt->i2_mv_cost[0][slot] = mv_cost;
        ps_ctxt->i2_mv_x[0][slot] = ps_result_prms->i2_mv_x;
        ps_ctxt->i2_mv_y[0][slot] = ps_result_prms->i2_mv_y;
        ps_ctxt->i2_ref_idx[0][slot] = ps_result_prms->i1_ref_idx;
    }

    /* Partitions without a usable result get a saturated full-pel SATD */
    for(part = 0; part < TOT_NUM_PARTS; part++)
    {
        if(ps_ctxt->i2_tot_cost[0][part] >= MAX_SIGNED_16BIT_VAL)
        {
            ps_ctxt->ai2_fullpel_satd[0][part] = MAX_SIGNED_16BIT_VAL;
        }
    }
}
667