• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 ******************************************************************************
22 * @file
23 *  ihevce_me_neon.c
24 *
25 * @brief
26 *  Subpel refinement modules for ME algo
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 
39 /*****************************************************************************/
40 /* File Includes                                                             */
41 /*****************************************************************************/
42 /* System include files */
43 #include <stdio.h>
44 #include <string.h>
45 #include <assert.h>
46 #include <arm_neon.h>
47 
48 /* User include files */
49 #include "ihevc_typedefs.h"
50 #include "itt_video_api.h"
51 #include "ihevc_cmn_utils_neon.h"
52 #include "ihevc_chroma_itrans_recon.h"
53 #include "ihevc_chroma_intra_pred.h"
54 #include "ihevc_debug.h"
55 #include "ihevc_deblk.h"
56 #include "ihevc_defs.h"
57 #include "ihevc_itrans_recon.h"
58 #include "ihevc_intra_pred.h"
59 #include "ihevc_inter_pred.h"
60 #include "ihevc_macros.h"
61 #include "ihevc_mem_fns.h"
62 #include "ihevc_padding.h"
63 #include "ihevc_quant_iquant_ssd.h"
64 #include "ihevc_resi_trans.h"
65 #include "ihevc_sao.h"
66 #include "ihevc_structs.h"
67 #include "ihevc_weighted_pred.h"
68 
69 #include "rc_cntrl_param.h"
70 #include "rc_frame_info_collector.h"
71 #include "rc_look_ahead_params.h"
72 
73 #include "ihevce_api.h"
74 #include "ihevce_defs.h"
75 #include "ihevce_lap_enc_structs.h"
76 #include "ihevce_multi_thrd_structs.h"
77 #include "ihevce_function_selector.h"
78 #include "ihevce_me_common_defs.h"
79 #include "ihevce_enc_structs.h"
80 #include "ihevce_had_satd.h"
81 #include "ihevce_ipe_instr_set_router.h"
82 #include "ihevce_global_tables.h"
83 
84 #include "hme_datatype.h"
85 #include "hme_common_defs.h"
86 #include "hme_common_utils.h"
87 #include "hme_interface.h"
88 #include "hme_defs.h"
89 #include "hme_err_compute.h"
90 #include "hme_globals.h"
91 
92 #include "ihevce_me_instr_set_router.h"
93 
94 /*****************************************************************************/
95 /* Typedefs                                                                  */
96 /*****************************************************************************/
97 typedef void ft_calc_sad4_nxn(
98     UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad);
99 
100 /*****************************************************************************/
101 /* Function Macros                                                           */
102 /*****************************************************************************/
/**
 * Derives every partition SAD for candidate index `i` from sixteen sub-block
 * SADs.
 *
 * `as` is indexed as a 4x4 raster of sub-blocks (row-major: as[0..3] is the
 * top row, as[0], as[4], as[8], as[12] is the left column). `pps` is indexed
 * as pps[partition id][candidate index].
 *
 * The four NxN quadrants are summed first; the symmetric two-way splits and
 * the 2Nx2N total are built from those; the narrow side of each asymmetric
 * split is summed directly and its complement is obtained by subtracting
 * from the 2Nx2N total.
 *
 * The expansion is a single statement (do/while(0)), so it is safe in an
 * unbraced if/else. Every argument is evaluated several times and must be
 * free of side effects.
 */
#define COMBINE_SADS(pps, as, i) \
    do \
    { \
        (pps)[PART_ID_NxN_TL][(i)] = ((as)[0] + (as)[1] + (as)[4] + (as)[5]); \
        (pps)[PART_ID_NxN_TR][(i)] = ((as)[2] + (as)[3] + (as)[6] + (as)[7]); \
        (pps)[PART_ID_NxN_BL][(i)] = ((as)[8] + (as)[9] + (as)[12] + (as)[13]); \
        (pps)[PART_ID_NxN_BR][(i)] = ((as)[10] + (as)[11] + (as)[14] + (as)[15]); \
 \
        (pps)[PART_ID_Nx2N_L][(i)] = (pps)[PART_ID_NxN_TL][(i)] + (pps)[PART_ID_NxN_BL][(i)]; \
        (pps)[PART_ID_Nx2N_R][(i)] = (pps)[PART_ID_NxN_TR][(i)] + (pps)[PART_ID_NxN_BR][(i)]; \
        (pps)[PART_ID_2NxN_T][(i)] = (pps)[PART_ID_NxN_TR][(i)] + (pps)[PART_ID_NxN_TL][(i)]; \
        (pps)[PART_ID_2NxN_B][(i)] = (pps)[PART_ID_NxN_BR][(i)] + (pps)[PART_ID_NxN_BL][(i)]; \
 \
        (pps)[PART_ID_nLx2N_L][(i)] = ((as)[8] + (as)[0] + (as)[12] + (as)[4]); \
        (pps)[PART_ID_nRx2N_R][(i)] = ((as)[3] + (as)[7] + (as)[15] + (as)[11]); \
        (pps)[PART_ID_2NxnU_T][(i)] = ((as)[1] + (as)[0] + (as)[2] + (as)[3]); \
        (pps)[PART_ID_2NxnD_B][(i)] = ((as)[15] + (as)[14] + (as)[12] + (as)[13]); \
 \
        (pps)[PART_ID_2Nx2N][(i)] = (pps)[PART_ID_2NxN_T][(i)] + (pps)[PART_ID_2NxN_B][(i)]; \
 \
        (pps)[PART_ID_2NxnU_B][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_2NxnU_T][(i)]; \
        (pps)[PART_ID_2NxnD_T][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_2NxnD_B][(i)]; \
        (pps)[PART_ID_nRx2N_L][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_nRx2N_R][(i)]; \
        (pps)[PART_ID_nLx2N_R][(i)] = (pps)[PART_ID_2Nx2N][(i)] - (pps)[PART_ID_nLx2N_L][(i)]; \
    } while(0)
127 
/**
 * Derives every partition SAD from sixteen sub-block SADs into a flat array.
 *
 * `as` is indexed as a 4x4 raster of sub-blocks (row-major: as[0..3] is the
 * top row, as[0], as[4], as[8], as[12] is the left column). `ps` is indexed
 * directly by partition id.
 *
 * The four NxN quadrants are summed first; the symmetric two-way splits and
 * the 2Nx2N total are built from those; the narrow side of each asymmetric
 * split is summed directly and its complement is obtained by subtracting
 * from the 2Nx2N total.
 *
 * The expansion is a single statement (do/while(0)), so it is safe in an
 * unbraced if/else. Both arguments are evaluated several times and must be
 * free of side effects.
 */
#define COMBINE_SADS_2(ps, as) \
    do \
    { \
        (ps)[PART_ID_NxN_TL] = ((as)[0] + (as)[1] + (as)[4] + (as)[5]); \
        (ps)[PART_ID_NxN_TR] = ((as)[2] + (as)[3] + (as)[6] + (as)[7]); \
        (ps)[PART_ID_NxN_BL] = ((as)[8] + (as)[9] + (as)[12] + (as)[13]); \
        (ps)[PART_ID_NxN_BR] = ((as)[10] + (as)[11] + (as)[14] + (as)[15]); \
 \
        (ps)[PART_ID_Nx2N_L] = (ps)[PART_ID_NxN_TL] + (ps)[PART_ID_NxN_BL]; \
        (ps)[PART_ID_Nx2N_R] = (ps)[PART_ID_NxN_TR] + (ps)[PART_ID_NxN_BR]; \
        (ps)[PART_ID_2NxN_T] = (ps)[PART_ID_NxN_TR] + (ps)[PART_ID_NxN_TL]; \
        (ps)[PART_ID_2NxN_B] = (ps)[PART_ID_NxN_BR] + (ps)[PART_ID_NxN_BL]; \
 \
        (ps)[PART_ID_nLx2N_L] = ((as)[8] + (as)[0] + (as)[12] + (as)[4]); \
        (ps)[PART_ID_nRx2N_R] = ((as)[3] + (as)[7] + (as)[15] + (as)[11]); \
        (ps)[PART_ID_2NxnU_T] = ((as)[1] + (as)[0] + (as)[2] + (as)[3]); \
        (ps)[PART_ID_2NxnD_B] = ((as)[15] + (as)[14] + (as)[12] + (as)[13]); \
 \
        (ps)[PART_ID_2Nx2N] = (ps)[PART_ID_2NxN_T] + (ps)[PART_ID_2NxN_B]; \
 \
        (ps)[PART_ID_2NxnU_B] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_2NxnU_T]; \
        (ps)[PART_ID_2NxnD_T] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_2NxnD_B]; \
        (ps)[PART_ID_nRx2N_L] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_nRx2N_R]; \
        (ps)[PART_ID_nLx2N_R] = (ps)[PART_ID_2Nx2N] - (ps)[PART_ID_nLx2N_L]; \
    } while(0)
152 
153 /*****************************************************************************/
154 /* Function Definitions                                                      */
155 /*****************************************************************************/
156 
/**
 * @brief Computes four horizontally adjacent 2x2 SADs in one call.
 *
 * Loads 8 bytes from each of two consecutive rows of source and prediction,
 * accumulates the absolute differences per column, then adds neighbouring
 * column pairs so that each output lane covers one 2x2 block.
 *
 * @param[in]  pu1_src    first byte of the 8x2 source strip
 * @param[in]  src_strd   step between consecutive source rows
 * @param[in]  pu1_pred   first byte of the 8x2 prediction strip
 * @param[in]  pred_strd  step between consecutive prediction rows
 * @param[out] pu4_sad    receives 4 SAD values
 */
static void ihevce_sad4_2x2_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    /* one 16-bit running sum per pixel column */
    uint16x8_t col_acc = vdupq_n_u16(0);
    WORD32 rows_left = 2;

    while(rows_left-- > 0)
    {
        col_acc = vabal_u8(col_acc, vld1_u8(pu1_src), vld1_u8(pu1_pred));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* widen while adding column pairs: 8 x u16 -> 4 x u32 */
    vst1q_u32(pu4_sad, vpaddlq_u16(col_acc));
}
177 
/**
 * @brief Computes four horizontally adjacent 4x4 SADs in one call.
 *
 * Loads 16 bytes from each of four consecutive rows of source and
 * prediction, accumulates the absolute differences per column, then reduces
 * each group of four columns to a single 16-bit sum.
 *
 * @param[in]  pu1_src    first byte of the 16x4 source strip
 * @param[in]  src_strd   step between consecutive source rows
 * @param[in]  pu1_pred   first byte of the 16x4 prediction strip
 * @param[in]  pred_strd  step between consecutive prediction rows
 * @param[out] pu2_sad    receives 4 SAD values (16-bit each)
 */
static void ihevce_sad4_4x4_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD16 *pu2_sad)
{
    /* per-column running sums: columns 0..7 and columns 8..15 */
    uint16x8_t acc_lo = vdupq_n_u16(0);
    uint16x8_t acc_hi = vdupq_n_u16(0);
    uint16x4_t pairs_lo, pairs_hi;
    WORD32 rows_left = 4;

    while(rows_left-- > 0)
    {
        const uint8x16_t src_row = vld1q_u8(pu1_src);
        const uint8x16_t pred_row = vld1q_u8(pu1_pred);

        acc_lo = vabal_u8(acc_lo, vget_low_u8(src_row), vget_low_u8(pred_row));
        acc_hi = vabal_u8(acc_hi, vget_high_u8(src_row), vget_high_u8(pred_row));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* first pass: sums of column pairs */
    pairs_lo = vpadd_u16(vget_low_u16(acc_lo), vget_high_u16(acc_lo));
    pairs_hi = vpadd_u16(vget_low_u16(acc_hi), vget_high_u16(acc_hi));

    /* second pass: sums of four columns, i.e. one value per 4x4 block */
    vst1_u16(pu2_sad, vpadd_u16(pairs_lo, pairs_hi));
}
203 
/**
 * @brief Computes four horizontally adjacent 8x8 SADs in one call.
 *
 * Loads 32 bytes from each of eight consecutive rows of source and
 * prediction. Each 8-column group has its own per-column accumulator; the
 * accumulators are then reduced with pairwise adds to one 32-bit sum per
 * 8x8 block.
 *
 * @param[in]  pu1_src    first byte of the 32x8 source strip
 * @param[in]  src_strd   step between consecutive source rows
 * @param[in]  pu1_pred   first byte of the 32x8 prediction strip
 * @param[in]  pred_strd  step between consecutive prediction rows
 * @param[out] pu4_sad    receives 4 SAD values
 */
static void ihevce_sad4_8x8_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    /* per-column running sums, one vector per 8x8 block */
    uint16x8_t acc_blk0 = vdupq_n_u16(0);
    uint16x8_t acc_blk1 = vdupq_n_u16(0);
    uint16x8_t acc_blk2 = vdupq_n_u16(0);
    uint16x8_t acc_blk3 = vdupq_n_u16(0);
    uint16x4_t pairs0, pairs1, pairs2, pairs3;
    uint16x4_t quads01, quads23;
    WORD32 rows_left = 8;

    while(rows_left-- > 0)
    {
        const uint8x16_t src_left = vld1q_u8(pu1_src);
        const uint8x16_t pred_left = vld1q_u8(pu1_pred);
        const uint8x16_t src_right = vld1q_u8(pu1_src + 16);
        const uint8x16_t pred_right = vld1q_u8(pu1_pred + 16);

        acc_blk0 = vabal_u8(acc_blk0, vget_low_u8(src_left), vget_low_u8(pred_left));
        acc_blk1 = vabal_u8(acc_blk1, vget_high_u8(src_left), vget_high_u8(pred_left));
        acc_blk2 = vabal_u8(acc_blk2, vget_low_u8(src_right), vget_low_u8(pred_right));
        acc_blk3 = vabal_u8(acc_blk3, vget_high_u8(src_right), vget_high_u8(pred_right));

        pu1_src += src_strd;
        pu1_pred += pred_strd;
    }

    /* 8 column sums -> 4 pair sums, per block */
    pairs0 = vpadd_u16(vget_low_u16(acc_blk0), vget_high_u16(acc_blk0));
    pairs1 = vpadd_u16(vget_low_u16(acc_blk1), vget_high_u16(acc_blk1));
    pairs2 = vpadd_u16(vget_low_u16(acc_blk2), vget_high_u16(acc_blk2));
    pairs3 = vpadd_u16(vget_low_u16(acc_blk3), vget_high_u16(acc_blk3));

    /* 4 pair sums -> 2 quad sums, per block; two blocks share a vector */
    quads01 = vpadd_u16(pairs0, pairs1);
    quads23 = vpadd_u16(pairs2, pairs3);

    /* final widening pairwise add leaves one 32-bit total per block */
    vst1q_u32(pu4_sad, vpaddlq_u16(vcombine_u16(quads01, quads23)));
}
242 
/**
 * @brief Computes four horizontally adjacent 16x16 SADs in one call.
 *
 * Delegates each block to ihevce_nxn_sad_computer_neon with a size argument
 * of 16, stepping 16 bytes to the right in both buffers between blocks.
 *
 * @param[in]  pu1_src    first byte of the 64x16 source strip
 * @param[in]  src_strd   source stride, forwarded unchanged
 * @param[in]  pu1_pred   first byte of the 64x16 prediction strip
 * @param[in]  pred_strd  prediction stride, forwarded unchanged
 * @param[out] pu4_sad    receives 4 SAD values
 */
static void ihevce_sad4_16x16_neon(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_pred, WORD32 pred_strd, UWORD32 *pu4_sad)
{
    WORD32 blk;

    for(blk = 0; blk < 4; blk++)
    {
        const WORD32 x_off = blk * 16;

        pu4_sad[blk] = ihevce_nxn_sad_computer_neon(
            pu1_src + x_off, src_strd, pu1_pred + x_off, pred_strd, 16);
    }
}
255 
/**
 * @brief Builds the candidate list for a set of search grids and computes
 *        all partition SADs for each candidate of an 8x8, 32x32 or 64x64 CU.
 *
 * Every grid contributes up to NUM_CANDIDATES_IN_GRID points (centre plus
 * neighbours at +/- the grid step); a point is used only when its bit in the
 * grid mask is set. For each candidate the block is covered by four calls of
 * a "four SADs per call" kernel, giving a 4x4 array of sub-block SADs which
 * COMBINE_SADS turns into per-partition SADs.
 *
 * Only CU_8x8, CU_32x32 and CU_64x64 have a kernel here. For any other size
 * no SAD can be computed: *num_cands is reported as 0 and pp_part_sads is
 * left untouched, instead of calling through a NULL function pointer.
 *
 * @param[in]  ps_grid         grid description (centres, masks, MVs, ref indices)
 * @param[in]  pu1_cur_ptr     top-left of the current block
 * @param[in]  cur_buf_stride  stride of the current-block buffer
 * @param[out] pp_part_sads    pp_part_sads[part id][candidate] receives the SADs
 * @param[out] ps_cand         receives one entry per enabled grid point
 * @param[out] num_cands       receives the number of entries written to ps_cand
 *                             (0 when e_cu_size is unsupported)
 * @param[in]  e_cu_size       CU size selector
 */
void compute_part_sads_for_MxM_blk_neon(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    WORD32 **pp_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands,
    CU_SIZE_T e_cu_size)
{
    /* grid step is packed as (y << 16) | x */
    WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);

    /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
    WORD32 offset_x[NUM_CANDIDATES_IN_GRID] = { 0,         -grd_sz_x, 0,         grd_sz_x, 0,
                                                -grd_sz_x, grd_sz_x,  -grd_sz_x, grd_sz_x };
    WORD32 offset_y[NUM_CANDIDATES_IN_GRID] = { 0,         0,         -grd_sz_y, 0,       grd_sz_y,
                                                -grd_sz_y, -grd_sz_y, grd_sz_y,  grd_sz_y };
    WORD32 shift = (WORD32)e_cu_size;

    WORD32 ref_buf_stride = ps_grid->ref_buf_stride;

    /* distance between successive bands of sub-blocks: (2 << shift) rows.
     * NOTE(review): this relies on the numeric value of e_cu_size matching
     * the kernel height chosen below - confirm against the CU_SIZE_T enum. */
    WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
    WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));

    cand_t *cand0 = ps_cand;

    ft_calc_sad4_nxn *calc_sad4 = NULL;

    /* for a 2Nx2N partition we evaluate (N/2)x(N/2) SADs. This is needed for
     * AMP cases */
    UWORD32 au4_nxn_sad[16];

    WORD32 i, j;

    *num_cands = 0;

    /* Loop to fill up the cand_t array and to calculate num_cands */
    for(i = 0; i < ps_grid->num_grids; i++)
    {
        WORD32 mask = ps_grid->pi4_grd_mask[i];
        UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
        WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
        WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);

        /* bit j of the mask enables grid point j */
        for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
        {
            if(mask & 1)
            {
                *num_cands = *num_cands + 1;
                cand0->grid_ix = i;
                cand0->ref_idx = ps_grid->p_ref_idx[i];
                cand0->pu1_ref_ptr =
                    pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
                cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
                cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
                cand0++;
            }
        }
    }

    /* fn selector */
    if(e_cu_size == CU_8x8)
    {
        calc_sad4 = ihevce_sad4_2x2_neon;
    }
    else if(e_cu_size == CU_32x32)
    {
        calc_sad4 = ihevce_sad4_8x8_neon;
    }
    else if(e_cu_size == CU_64x64)
    {
        calc_sad4 = ihevce_sad4_16x16_neon;
    }

    if(NULL == calc_sad4)
    {
        /* No kernel for this CU size. Calling through the NULL pointer below
         * would be undefined behaviour, so report that nothing was evaluated.
         * Debug builds trap only when candidates were actually requested. */
        assert(0 == *num_cands);
        *num_cands = 0;
        return;
    }

    /* Loop to compute the SAD's */
    for(i = 0; i < *num_cands; i++)
    {
        cand_t *cand = ps_cand + i;

        /* four bands of four sub-blocks each -> 4x4 raster of SADs */
        for(j = 0; j < 4; j++)
        {
            (*calc_sad4)(
                pu1_cur_ptr + j * cur_buf_stride_lsN,
                cur_buf_stride,
                cand->pu1_ref_ptr + j * ref_buf_stride_lsN,
                ref_buf_stride,
                &au4_nxn_sad[4 * j]);
        }

        COMBINE_SADS(pp_part_sads, au4_nxn_sad, i);
    }
}
340 
/**
 * @brief Builds the candidate list for a set of search grids and computes
 *        all partition SADs of a 16x16 block for each candidate.
 *
 * Every grid contributes up to NUM_CANDIDATES_IN_GRID points (centre plus
 * neighbours at +/- the grid step); a point is used only when its bit in the
 * grid mask is set. For each candidate, four calls of the 4x4 kernel (one
 * per band of four rows) produce sixteen 4x4 SADs, which COMBINE_SADS turns
 * into per-partition SADs.
 *
 * @param[in]  ps_grid         grid description (centres, masks, MVs, ref indices)
 * @param[in]  pu1_cur_ptr     top-left of the current 16x16 block
 * @param[in]  cur_buf_stride  stride of the current-block buffer
 * @param[out] pp_part_sads    pp_part_sads[part id][candidate] receives the SADs
 * @param[out] ps_cand         receives one entry per enabled grid point
 * @param[out] num_cands       receives the number of entries written to ps_cand
 */
void compute_4x4_sads_for_16x16_blk_neon(
    grid_ctxt_t *ps_grid,
    UWORD8 *pu1_cur_ptr,
    WORD32 cur_buf_stride,
    UWORD16 **pp_part_sads,
    cand_t *ps_cand,
    WORD32 *num_cands)
{
    /* grid step is packed as (y << 16) | x */
    WORD16 step_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
    WORD16 step_x = (ps_grid->grd_sz_y_x & 0xFFFF);

    /* Grid point order: C, L, T, R, B, TL, TR, BL, BR */
    WORD32 pt_dx[NUM_CANDIDATES_IN_GRID] = {
        0, -step_x, 0, step_x, 0, -step_x, step_x, -step_x, step_x
    };
    WORD32 pt_dy[NUM_CANDIDATES_IN_GRID] = {
        0, 0, -step_y, 0, step_y, -step_y, -step_y, step_y, step_y
    };

    WORD32 ref_stride = ps_grid->ref_buf_stride;

    /* one band of 4x4 blocks is four rows tall */
    WORD32 cur_band_step = (cur_buf_stride << 2);
    WORD32 ref_band_step = (ref_stride << 2);

    cand_t *ps_next_cand = ps_cand;

    /* 4x4 raster of 4x4 SADs; the finer granularity is needed for AMP */
    UWORD16 au2_4x4_sad[16];

    WORD32 grid_ix, cand_ix, band;

    *num_cands = 0;

    /* Pass 1: expand the grid masks into candidate records */
    for(grid_ix = 0; grid_ix < ps_grid->num_grids; grid_ix++)
    {
        WORD32 pt;
        WORD32 pending = ps_grid->pi4_grd_mask[grid_ix];
        UWORD8 *pu1_center = ps_grid->ppu1_ref_ptr[grid_ix];
        WORD32 mv_x = ps_grid->p_mv[grid_ix].i2_mv_x;
        WORD32 mv_y = ps_grid->p_mv[grid_ix].i2_mv_y;

        for(pt = 0; pt < NUM_CANDIDATES_IN_GRID; pt++)
        {
            /* bit `pt` of the mask enables grid point `pt` */
            if(pending & 1)
            {
                ps_next_cand->grid_ix = grid_ix;
                ps_next_cand->ref_idx = ps_grid->p_ref_idx[grid_ix];
                ps_next_cand->pu1_ref_ptr = pu1_center + pt_dx[pt] + ref_stride * pt_dy[pt];
                ps_next_cand->mv.i2_mv_x = (S16)(mv_x) + pt_dx[pt];
                ps_next_cand->mv.i2_mv_y = (S16)(mv_y) + pt_dy[pt];
                ps_next_cand++;

                (*num_cands)++;
            }
            pending >>= 1;
        }
    }

    /* Pass 2: SADs for every candidate */
    for(cand_ix = 0; cand_ix < *num_cands; cand_ix++)
    {
        UWORD8 *pu1_cand_ref = ps_cand[cand_ix].pu1_ref_ptr;

        for(band = 0; band < 4; band++)
        {
            ihevce_sad4_4x4_neon(
                pu1_cur_ptr + band * cur_band_step,
                cur_buf_stride,
                pu1_cand_ref + band * ref_band_step,
                ref_stride,
                &au2_4x4_sad[4 * band]);
        }

        COMBINE_SADS(pp_part_sads, au2_4x4_sad, cand_ix);
    }
}
413 
/**
 * @brief Computes one SAD per enabled point of a 9-point grid for a single
 *        partition.
 *
 * The part mask must have at most one bit set (asserted). Results are
 * written contiguously, in grid-point order, starting at
 * pi4_sad_grid[pi4_valid_part_ids[0] * (number of enabled grid points)].
 * Each point's reference pointer is the base reference displaced by
 * i4_step times the per-point x/y multipliers taken from
 * gai1_grid_id_to_x / gai1_grid_id_to_y.
 *
 * @param[in,out] ps_prms  input/reference pointers, strides, block size,
 *                         masks and the output SAD grid
 */
void hme_evalsad_grid_npu_MxN_neon(err_prms_t *ps_prms)
{
    const S32 step_x = ps_prms->i4_step;
    const S32 step_y = ps_prms->i4_step * ps_prms->i4_ref_stride;
    S32 *pi4_out = ps_prms->pi4_sad_grid;
    S32 num_enabled = 0;
    S32 pt;

    assert((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);

    /* count the enabled grid points to locate this partition's output slot */
    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            num_enabled++;
        }
    }
    pi4_out += (ps_prms->pi4_valid_part_ids[0] * num_enabled);

    for(pt = 0; pt < 9; pt++)
    {
        if(ps_prms->i4_grid_mask & (1 << pt))
        {
            U08 *pu1_inp = ps_prms->pu1_inp;
            U08 *pu1_ref_pt = ps_prms->pu1_ref;

            pu1_ref_pt += step_x * gai1_grid_id_to_x[pt];
            pu1_ref_pt += step_y * gai1_grid_id_to_y[pt];

            *pi4_out = ihevce_4mx4n_sad_computer_neon(
                pu1_inp,
                pu1_ref_pt,
                ps_prms->i4_inp_stride,
                ps_prms->i4_ref_stride,
                ps_prms->i4_blk_wd,
                ps_prms->i4_blk_ht);
            pi4_out++;
        }
    }
}
449 
/**
 * @brief Computes the SAD of a single block at a single position.
 *
 * Forwards the input/reference pointers, strides and block dimensions from
 * ps_prms to ihevce_4mx4n_sad_computer_neon and stores the result in
 * pi4_sad_grid[0].
 *
 * @param[in,out] ps_prms  block description and output SAD grid
 */
void hme_evalsad_pt_npu_MxN_8bit_neon(err_prms_t *ps_prms)
{
    S32 *pi4_out = ps_prms->pi4_sad_grid;

    pi4_out[0] = ihevce_4mx4n_sad_computer_neon(
        ps_prms->pu1_inp,
        ps_prms->pu1_ref,
        ps_prms->i4_inp_stride,
        ps_prms->i4_ref_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);
}
460 
/**
 * @brief Evaluates a list of full-pel search nodes on a 16x16 block and
 *        keeps, per partition, the single best (lowest total cost) result.
 *
 * For each node: the weighted input and the displaced reference pointers are
 * written into ps_err_prms, sixteen 4x4 SADs are computed and combined into
 * per-partition SADs in ps_err_prms->pi4_sad_grid, an MV cost is derived
 * from the difference to the 2Nx2N MV predictor, and every valid partition
 * whose (SAD + MV cost) beats the stored best has its entry [0] in the
 * full-pel refine context replaced.
 *
 * After the scan, partitions whose best / second-best total cost is still at
 * MAX_SIGNED_16BIT_VAL get their ref index set to that of the first search
 * node.
 *
 * @param[in]     ps_search_prms  search nodes, block offsets, refine context
 * @param[in]     ps_wt_inp_prms  per-reference weighted input planes
 * @param[in,out] ps_err_prms     strides, SAD grid; pu1_inp/pu1_ref are overwritten
 * @param[in]     ps_result_prms  search results holding the MV-cost context
 * @param[in]     ppu1_ref        per-reference reference planes
 * @param[in]     i4_ref_stride   stride used to displace the reference pointer
 */
void hme_calc_sad_and_1_best_result_neon(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
{
    mv_refine_ctxt_t *ps_refine = ps_search_prms->ps_fullpel_refine_ctxt;
    search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
    S32 *pi4_part_sad = ps_err_prms->pi4_sad_grid;
    const S32 num_nodes = ps_search_prms->i4_num_search_nodes;
    const S32 inp_strd = ps_err_prms->i4_inp_stride;
    const S32 ref_strd = ps_err_prms->i4_ref_stride;
    /* one band of 4x4 blocks is four rows tall */
    const S32 inp_band_step = (inp_strd << 2);
    const S32 ref_band_step = (ref_strd << 2);
    S32 i4_inp_off, i4_ref_off;
    S32 node_ix, part_ix;

    i4_inp_off = ps_search_prms->i4_cu_x_off + (ps_search_prms->i4_cu_y_off * inp_strd);
    i4_ref_off = ps_search_prms->i4_x_off + (ps_search_prms->i4_y_off * i4_ref_stride);

    for(node_ix = 0; node_ix < num_nodes; node_ix++)
    {
        U16 au2_4x4_sad[16];
        S32 i4_mv_cost;
        S32 band;

        /* The node cursor only advances at the bottom of this loop body, so
         * an INTRA_MV node can never be stepped over: once one is reached,
         * no further node is evaluated. Leaving the loop here makes that
         * explicit.
         * NOTE(review): confirm that stopping at the first INTRA_MV node
         * (rather than skipping just that node) is intended, and that any
         * non-NEON counterpart behaves the same way. */
        if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
        {
            break;
        }

        /* publish the block pointers for this node through ps_err_prms */
        ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
        ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_off;
        ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
        ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);

        /* 4 bands x 4 blocks -> sixteen 4x4 SADs */
        for(band = 0; band < 4; band++)
        {
            UWORD8 *pu1_inp_band = ps_err_prms->pu1_inp;
            UWORD8 *pu1_ref_band = ps_err_prms->pu1_ref;

            pu1_inp_band += band * inp_band_step;
            pu1_ref_band += band * ref_band_step;

            ihevce_sad4_4x4_neon(
                pu1_inp_band, inp_strd, pu1_ref_band, ref_strd, &au2_4x4_sad[4 * band]);
        }

        COMBINE_SADS_2(pi4_part_sad, au2_4x4_sad);

        /* MV cost relative to the 2Nx2N predictor */
        {
            S16 mvdx1, mvdy1;
            search_results_t *ps_results = ps_result_prms->ps_search_results;
            pred_ctxt_t *ps_pred_ctxt = &ps_results->as_pred_ctxt[ps_result_prms->i1_ref_idx];
            search_node_t *ps_mvp_node = ps_pred_ctxt->as_pred_nodes[PART_2Nx2N].ps_mvp_node;

            S32 inp_shift = 2;
            S32 pred_shift = ps_mvp_node->u1_subpel_done ? 0 : 2;
            S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
            S32 lambda = ps_pred_ctxt->lambda;
            S32 rnd = 1 << (lambda_q_shift - 1);
            S32 mv_p_x = ps_mvp_node->s_mv.i2_mvx;
            S32 mv_p_y = ps_mvp_node->s_mv.i2_mvy;
            S32 ref_bits =
                ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];

            COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);

            mvdx1 = ABS(mvdx1);
            mvdy1 = ABS(mvdy1);

            /* bit estimate for the MV difference plus the reference index */
            i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) +
                         ref_bits + 2;

            /* scale by lambda with rounding */
            i4_mv_cost = ((i4_mv_cost * lambda) + rnd) >> lambda_q_shift;

            i4_mv_cost = CLIP_U16(i4_mv_cost);
        }

        /* Per valid partition: replace the stored best if this node is cheaper */
        {
            S32 *pi4_valid_part_ids = &ps_refine->ai4_part_id[0];

            for(part_ix = 0; part_ix < ps_refine->i4_num_valid_parts; part_ix++)
            {
                S32 part_id = pi4_valid_part_ids[part_ix];
                /* slot is the part id itself when more than 8 parts are valid,
                 * otherwise the position in the valid-part list */
                S32 slot = (ps_refine->i4_num_valid_parts > 8) ? part_id : part_ix;
                S32 i4_sad = CLIP3(pi4_part_sad[part_id], 0, 0x7fff);
                S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
                S32 i4_best_cost = CLIP_S16(ps_refine->i2_tot_cost[0][slot]);

                if(i4_tot_cost < i4_best_cost)
                {
                    ps_refine->i2_tot_cost[0][slot] = i4_tot_cost;
                    ps_refine->i2_mv_cost[0][slot] = i4_mv_cost;
                    ps_refine->i2_mv_x[0][slot] = ps_search_node->s_mv.i2_mvx;
                    ps_refine->i2_mv_y[0][slot] = ps_search_node->s_mv.i2_mvy;
                    ps_refine->i2_ref_idx[0][slot] = ps_search_node->i1_ref_idx;
                }
            }
        }

        ps_search_node++;
    }

    /* rewind to the first node: its ref index is the fallback below */
    ps_search_node = ps_search_prms->ps_search_nodes;

    for(part_ix = 0; part_ix < ps_refine->i4_num_valid_parts; part_ix++)
    {
        S32 part_id = ps_refine->ai4_part_id[part_ix];
        S32 rank;

        /* rank 0 = best result, rank 1 = second best */
        for(rank = 0; rank < 2; rank++)
        {
            if(ps_refine->i2_tot_cost[rank][part_id] >= MAX_SIGNED_16BIT_VAL)
            {
                assert(ps_refine->i2_mv_cost[rank][part_id] == MAX_SIGNED_16BIT_VAL);
                assert(ps_refine->i2_mv_x[rank][part_id] == 0);
                assert(ps_refine->i2_mv_y[rank][part_id] == 0);

                ps_refine->i2_ref_idx[rank][part_id] = ps_search_node->i1_ref_idx;
            }
        }
    }
}
608 
/**
 * @brief Evaluates one sub-pel position on a 16x16 block and updates the
 *        per-partition best result in the sub-pel refine context.
 *
 * Sixteen 4x4 SADs are computed between ps_err_prms->pu1_inp and
 * ps_err_prms->pu1_ref and combined into per-partition SADs in
 * ps_err_prms->pi4_sad_grid. For every valid partition the MV cost already
 * stored in entry [0] is reused; if (SAD + MV cost) beats the stored total,
 * the entry takes the MV and ref index from ps_result_prms. Finally, every
 * partition whose total cost is still at MAX_SIGNED_16BIT_VAL has its
 * full-pel SATD set to that same value.
 *
 * @param[in,out] ps_err_prms     block pointers, strides and SAD grid
 * @param[in,out] ps_result_prms  candidate MV / ref index and the refine context
 */
void hme_calc_sad_and_1_best_result_subpel_neon(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_refine = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_part_sad = ps_err_prms->pi4_sad_grid;
    S32 *pi4_valid_part_ids = &ps_refine->ai4_part_id[0];
    const S32 inp_strd = ps_err_prms->i4_inp_stride;
    const S32 ref_strd = ps_err_prms->i4_ref_stride;
    /* one band of 4x4 blocks is four rows tall */
    const S32 inp_band_step = (inp_strd << 2);
    const S32 ref_band_step = (ref_strd << 2);
    U16 au2_4x4_sad[16];
    S32 band, part_ix, part_id;

    /* 4 bands x 4 blocks -> sixteen 4x4 SADs */
    for(band = 0; band < 4; band++)
    {
        UWORD8 *pu1_inp_band = ps_err_prms->pu1_inp;
        UWORD8 *pu1_ref_band = ps_err_prms->pu1_ref;

        pu1_inp_band += band * inp_band_step;
        pu1_ref_band += band * ref_band_step;

        ihevce_sad4_4x4_neon(
            pu1_inp_band, inp_strd, pu1_ref_band, ref_strd, &au2_4x4_sad[4 * band]);
    }

    COMBINE_SADS_2(pi4_part_sad, au2_4x4_sad);

    /* Per valid partition: replace the stored best if this position is cheaper */
    for(part_ix = 0; part_ix < ps_refine->i4_num_valid_parts; part_ix++)
    {
        S32 i4_mv_cost, i4_sad, i4_tot_cost, i4_best_cost;
        /* slot is the part id itself when more than 8 parts are valid,
         * otherwise the position in the valid-part list */
        S32 slot;

        part_id = pi4_valid_part_ids[part_ix];
        slot = (ps_refine->i4_num_valid_parts > 8) ? part_id : part_ix;

        i4_mv_cost = ps_refine->i2_mv_cost[0][slot];
        i4_sad = CLIP3(pi4_part_sad[part_id], 0, 0x7fff);
        i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
        i4_best_cost = CLIP_S16(ps_refine->i2_tot_cost[0][slot]);

        if(i4_tot_cost < i4_best_cost)
        {
            ps_refine->i2_tot_cost[0][slot] = i4_tot_cost;
            /* the MV cost is written back with the value read above */
            ps_refine->i2_mv_cost[0][slot] = i4_mv_cost;
            ps_refine->i2_mv_x[0][slot] = ps_result_prms->i2_mv_x;
            ps_refine->i2_mv_y[0][slot] = ps_result_prms->i2_mv_y;
            ps_refine->i2_ref_idx[0][slot] = ps_result_prms->i1_ref_idx;
        }
    }

    /* partitions that never received a result carry the sentinel SATD too */
    for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
    {
        if(ps_refine->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
        {
            ps_refine->ai2_fullpel_satd[0][part_id] = MAX_SIGNED_16BIT_VAL;
        }
    }
}
667