/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_hme_utils_neon.c
*
* @brief
*  Contains function definitions for HME utility functions in NEON intrinsics
*
* @author
* Ittiam
*
* @par List of Functions:
*   - hme_get_wt_inp_8x8_neon()
*   - hme_get_wt_inp_ctb_neon()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_debug.h"
#include "ihevc_deblk.h"
#include "ihevc_defs.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_resi_trans.h"
#include "ihevc_sao.h"
#include "ihevc_structs.h"
#include "ihevc_weighted_pred.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_had_satd.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_global_tables.h"

#include "hme_datatype.h"
#include "hme_interface.h"
#include "hme_common_defs.h"
#include "hme_defs.h"
#include "ihevce_me_instr_set_router.h"
#include "hme_globals.h"
#include "hme_utils.h"
#include "hme_coarse.h"
#include "hme_refine.h"

/*****************************************************************************/
/* Constant Macros                                                           */
/*****************************************************************************/
#define IHEVCE_WT_PRED_SHIFT 15

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

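/*
 * Reference sketch (illustrative only, added for documentation; not part of
 * the original encoder flow and not called anywhere): both NEON routines
 * below vectorize the same per-pixel weighted-input mapping,
 *
 *     dst = clip_u8((((src - off) << wpred_log_wdc) * inv_wt + (1 << 14))
 *                    >> IHEVCE_WT_PRED_SHIFT)
 *
 * with off taken from a_wpred_off[] and inv_wt from a_inv_wpred_wt[] of
 * wgt_pred_ctxt_t. The helper name below is hypothetical.
 */
static INLINE UWORD8 ihevce_wt_inp_pixel_scalar(
    UWORD8 u1_src, WORD32 i4_off, WORD32 i4_inv_wt, WORD32 i4_log_wdc)
{
    /* (inp - off) << shift */
    WORD32 i4_tmp = ((WORD32)u1_src - i4_off) << i4_log_wdc;

    /* ((inp - off) << shift) * inv_wt + 1<<14, then >> 15 */
    i4_tmp = (i4_tmp * i4_inv_wt + (1 << 14)) >> IHEVCE_WT_PRED_SHIFT;

    /* Saturate to 8 bits, as vqmovun_s16 does in the vector paths */
    if(i4_tmp < 0)
        i4_tmp = 0;
    if(i4_tmp > 255)
        i4_tmp = 255;

    return (UWORD8)i4_tmp;
}
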
static INLINE void ihevce_get_wt_inp_4x8_neon(
    const UWORD8 *pu1_src,
    UWORD8 *pu1_dst,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    WORD32 u1_num_valid_refs,
    WORD32 *ai4_wt_refs,
    WORD32 src_stride,
    WORD32 dst_stride)
{
    S32 inv_wt;
    S16 off;
    uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
    int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b, off_8x16b;
    int32x4_t dst0_4x32b, dst1_4x32b, dst2_4x32b, dst3_4x32b;
    int32x4_t dst4_4x32b, dst5_4x32b, dst6_4x32b, dst7_4x32b;
    int32x4_t add_4x32b, inv_wt_4x32b;
    U08 ref;
    int32x4_t log_wdc = vdupq_n_s32(ps_wt_inp_prms->wpred_log_wdc);

    src0_8x8b = vld1_u8((pu1_src + 0 * src_stride));
    src1_8x8b = vld1_u8((pu1_src + 1 * src_stride));
    src2_8x8b = vld1_u8((pu1_src + 2 * src_stride));
    src3_8x8b = vld1_u8((pu1_src + 3 * src_stride));
    /* Store */
    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

    if(u1_num_valid_refs)
    {
        /* Widen the four source rows to 16 bit */
        src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(src0_8x8b));
        src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(src1_8x8b));
        src2_8x16b = vreinterpretq_s16_u16(vmovl_u8(src2_8x8b));
        src3_8x16b = vreinterpretq_s16_u16(vmovl_u8(src3_8x8b));

        /* add value */
        add_4x32b = vdupq_n_s32(0x4000);
    }

    /* Run through all ref ids, except ref==num_ref, which is already done */
    for(ref = 0; ref < u1_num_valid_refs; ref++)
    {
        S32 i4_ref_idx = ai4_wt_refs[ref];

        /* InvWt and off specific to this ref id */
        inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[i4_ref_idx];
        off = (S16)ps_wt_inp_prms->a_wpred_off[i4_ref_idx];

        /* Broadcasting scalars is not free : keep it to once per ref, outside the pixel loops */
        off_8x16b = vdupq_n_s16(off);
        inv_wt_4x32b = vdupq_n_s32(inv_wt);

        /* Each ref id may have different wt/offset. */
        /* So we have a unique inp buf for each ref id */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[i4_ref_idx];

        /* inp - off */
        src4_8x16b = vsubq_s16(src0_8x16b, off_8x16b);
        src5_8x16b = vsubq_s16(src1_8x16b, off_8x16b);
        src6_8x16b = vsubq_s16(src2_8x16b, off_8x16b);
        src7_8x16b = vsubq_s16(src3_8x16b, off_8x16b);

        dst0_4x32b = vmovl_s16(vget_low_s16(src4_8x16b));
        dst1_4x32b = vmovl_s16(vget_low_s16(src5_8x16b));
        dst2_4x32b = vmovl_s16(vget_low_s16(src6_8x16b));
        dst3_4x32b = vmovl_s16(vget_low_s16(src7_8x16b));

        dst4_4x32b = vmovl_s16(vget_high_s16(src4_8x16b));
        dst5_4x32b = vmovl_s16(vget_high_s16(src5_8x16b));
        dst6_4x32b = vmovl_s16(vget_high_s16(src6_8x16b));
        dst7_4x32b = vmovl_s16(vget_high_s16(src7_8x16b));

        /* (inp-off) << shift */
        dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
        dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
        dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
        dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

        /* (inp-off) << shift */
        dst4_4x32b = vshlq_s32(dst4_4x32b, log_wdc);
        dst5_4x32b = vshlq_s32(dst5_4x32b, log_wdc);
        dst6_4x32b = vshlq_s32(dst6_4x32b, log_wdc);
        dst7_4x32b = vshlq_s32(dst7_4x32b, log_wdc);

        /* ((inp-off) << shift) * inv_wt + 1<<14 */
        dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
        dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
        dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
        dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

        /* ((inp-off) << shift) * inv_wt + 1<<14 */
        dst4_4x32b = vmlaq_s32(add_4x32b, dst4_4x32b, inv_wt_4x32b);
        dst5_4x32b = vmlaq_s32(add_4x32b, dst5_4x32b, inv_wt_4x32b);
        dst6_4x32b = vmlaq_s32(add_4x32b, dst6_4x32b, inv_wt_4x32b);
        dst7_4x32b = vmlaq_s32(add_4x32b, dst7_4x32b, inv_wt_4x32b);

        /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
        src4_8x16b = vcombine_s16(
            vshrn_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst4_4x32b, IHEVCE_WT_PRED_SHIFT));
        src5_8x16b = vcombine_s16(
            vshrn_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst5_4x32b, IHEVCE_WT_PRED_SHIFT));
        src6_8x16b = vcombine_s16(
            vshrn_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst6_4x32b, IHEVCE_WT_PRED_SHIFT));
        src7_8x16b = vcombine_s16(
            vshrn_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst7_4x32b, IHEVCE_WT_PRED_SHIFT));
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), vqmovun_s16(src4_8x16b));
        vst1_u8((pu1_dst + 1 * dst_stride), vqmovun_s16(src5_8x16b));
        vst1_u8((pu1_dst + 2 * dst_stride), vqmovun_s16(src6_8x16b));
        vst1_u8((pu1_dst + 3 * dst_stride), vqmovun_s16(src7_8x16b));
    }
}

void hme_get_wt_inp_8x8_neon(
    layer_ctxt_t *ps_curr_layer,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S32 dst_stride,
    S32 pos_x,
    S32 pos_y,
    S32 size,
    S32 num_ref,
    U08 u1_is_wt_pred_on)
{
    WORD32 ref;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 x_count, y_count;
    WORD32 src_stride = ps_curr_layer->i4_inp_stride;

    /* Make sure the start positions of block are inside frame limits */
    pos_x = MIN(pos_x, ps_curr_layer->i4_wd - 1);
    pos_y = MIN(pos_y, ps_curr_layer->i4_ht - 1);

    /* In case we handle incomplete CTBs, we copy only as much as required */
    /* from input buffers to prevent out of bound accesses. In this        */
    /* case, we do padding in x or y or both directions                    */
    x_count = MIN(size, (ps_curr_layer->i4_wd - pos_x));
    y_count = MIN(size, (ps_curr_layer->i4_ht - pos_y));

    /* Fixed source */
    pu1_src = ps_curr_layer->pu1_inp;
    pu1_src += (pos_x + (pos_y * src_stride));

    if(!u1_is_wt_pred_on)
    {
        uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;

        /*************         Top 4x8 Processing        ****************/
        /* Load Source : Lower 64 bit */
        src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
        src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
        src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
        src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

        /* ref==num_ref */ /* last ref will be non weighted input */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
        vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
        vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
        vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

        /*************       Bottom 4x8 Processing        ****************/
        pu1_src += 4 * src_stride;
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref] + 4 * dst_stride;

        /* Load Source : Lower 64 bit */
        src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
        src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
        src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
        src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);
        /* ref==num_ref */ /* last ref will be non weighted input */
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
        vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
        vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
        vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }

        for(ref = 0; ref < num_ref + 1; ref++)
        {
            ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        }
    }
    else
    {
        S32 wt, off;
        S32 ai4_wt_refs[MAX_NUM_REF];
        U08 u1_num_valid_refs = 0;

        for(ref = 0; ref < num_ref; ref++)
        {
            wt = ps_wt_inp_prms->a_wpred_wt[ref];
            off = ps_wt_inp_prms->a_wpred_off[ref];

            if((WGHT_DEFAULT == wt) && (0 == off))
            {
                ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
            }
            else
            {
                ai4_wt_refs[u1_num_valid_refs++] = ref;
                ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[ref];
            }
        }

        ps_wt_inp_prms->apu1_wt_inp[num_ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        /*************         Top 4x8 Processing        ****************/
        /* ref==num_ref */ /* last ref will be non weighted input */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
        ihevce_get_wt_inp_4x8_neon(
            pu1_src,
            pu1_dst,
            ps_wt_inp_prms,
            u1_num_valid_refs,
            ai4_wt_refs,
            src_stride,
            dst_stride);
        /*************       Bottom 4x8 Processing        ****************/
        pu1_src += 4 * src_stride;
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref] + 4 * dst_stride;
        ihevce_get_wt_inp_4x8_neon(
            pu1_src,
            pu1_dst,
            ps_wt_inp_prms,
            u1_num_valid_refs,
            ai4_wt_refs,
            src_stride,
            dst_stride);

        for(ref = 0; ref < u1_num_valid_refs; ref++)
        {
            /* Check and do padding in right direction if need be */
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ai4_wt_refs[ref]];
            if(x_count != size)
            {
                hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
            }

            /* Check and do padding in bottom direction if need be */
            if(y_count != size)
            {
                hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
            }
        }

        /* Check and do padding in right direction if need be */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
}

void hme_get_wt_inp_ctb_neon(
    layer_ctxt_t *ps_curr_layer,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S32 dst_stride,
    S32 pos_x,
    S32 pos_y,
    S32 size,
    S32 num_ref,
    U08 u1_is_wt_pred_on)
{
    WORD32 ref, i, j;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 x_count, y_count;
    WORD32 src_stride = ps_curr_layer->i4_inp_stride;

    /* In case we handle incomplete CTBs, we copy only as much as required */
    /* from input buffers to prevent out of bound accesses. In this        */
    /* case, we do padding in x or y or both directions                    */
    x_count = MIN(size, (ps_curr_layer->i4_wd - pos_x));
    y_count = MIN(size, (ps_curr_layer->i4_ht - pos_y));

    /* Fixed source */
    pu1_src = ps_curr_layer->pu1_inp;
    pu1_src += (pos_x + (pos_y * src_stride));

    if(!u1_is_wt_pred_on)
    {
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(0 == (x_count & 15))
        {
            uint8x16_t src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 16) /* 16 cols at a time */
                {
                    /* Load 4x16 Source */
                    src0_16x8b = vld1q_u8(pu1_src + 0 * src_stride);
                    src1_16x8b = vld1q_u8(pu1_src + 1 * src_stride);
                    src2_16x8b = vld1q_u8(pu1_src + 2 * src_stride);
                    src3_16x8b = vld1q_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    /* Store */
                    vst1q_u8((pu1_dst + 0 * dst_stride), src0_16x8b);
                    vst1q_u8((pu1_dst + 1 * dst_stride), src1_16x8b);
                    vst1q_u8((pu1_dst + 2 * dst_stride), src2_16x8b);
                    vst1q_u8((pu1_dst + 3 * dst_stride), src3_16x8b);

                    pu1_src += 16;
                    pu1_dst += 16;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }
        else if(0 == (x_count & 7)) /* wd multiple of 8 case */
        {
            uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 8) /* 8 cols at a time */
                {
                    /* Load 4x8 Source */
                    src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
                    src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
                    src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
                    src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    /* Store */
                    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
                    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
                    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
                    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

                    pu1_src += 8;
                    pu1_dst += 8;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }
        else /* wd multiple of 4 case */
        {
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                {
                    /* ref==num_ref */ /* last ref will be non weighted input */
                    *(WORD32 *)(&pu1_dst[0 * dst_stride]) = *(WORD32 *)(&pu1_src[0 * src_stride]);
                    *(WORD32 *)(&pu1_dst[1 * dst_stride]) = *(WORD32 *)(&pu1_src[1 * src_stride]);
                    *(WORD32 *)(&pu1_dst[2 * dst_stride]) = *(WORD32 *)(&pu1_src[2 * src_stride]);
                    *(WORD32 *)(&pu1_dst[3 * dst_stride]) = *(WORD32 *)(&pu1_src[3 * src_stride]);

                    pu1_src += 4;
                    pu1_dst += 4;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }

        for(i = 0; i < num_ref + 1; i++)
        {
            ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        }

        /* Padding */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];

        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
    else
    {
        S32 ai4_wt_refs[MAX_NUM_REF];
        U08 u1_num_valid_refs = 0;
        int32x4_t dst0_4x32b, dst1_4x32b, dst2_4x32b, dst3_4x32b;
        int32x4_t inv_wt_4x32b, off_4x32b;
        int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;

        /* add value */
        int32x4_t add_4x32b = vdupq_n_s32(0x4000);
        int32x4_t log_wdc = vdupq_n_s32(ps_wt_inp_prms->wpred_log_wdc);

        for(i = 0; i < num_ref; i++)
        {
            if((WGHT_DEFAULT == (ps_wt_inp_prms->a_wpred_wt[i])) &&
               (0 == (ps_wt_inp_prms->a_wpred_off[i])))
            {
                ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
            }
            else
            {
                ai4_wt_refs[u1_num_valid_refs++] = i;
                ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[i];
            }
        }

        ps_wt_inp_prms->apu1_wt_inp[num_ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(0 == (x_count & 7)) /* wd multiple of 8 case */
        {
            uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
            int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b, off_8x16b;
            int32x4_t dst4_4x32b, dst5_4x32b, dst6_4x32b, dst7_4x32b;

            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 8) /* 8 cols at a time */
                {
                    /* Load 4x8 Source : 64 bits (8 pixels) per row */
                    src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
                    src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
                    src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
                    src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    pu1_dst = (ps_wt_inp_prms->apu1_wt_inp[num_ref]) + (i * dst_stride) + j;

                    /* Store */
                    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
                    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
                    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
                    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

                    if(u1_num_valid_refs)
                    {
                        /* Widen the four source rows to 16 bit */
                        src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(src0_8x8b));
                        src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(src1_8x8b));
                        src2_8x16b = vreinterpretq_s16_u16(vmovl_u8(src2_8x8b));
                        src3_8x16b = vreinterpretq_s16_u16(vmovl_u8(src3_8x8b));
                    }

                    /* Run through all ref ids, except ref==num_ref, which is already done */
                    for(ref = 0; ref < u1_num_valid_refs; ref++)
                    {
                        U08 u1_ref_idx = ai4_wt_refs[ref];

                        /* Each ref id may have different wt/offset. */
                        /* So we have a unique inp buf for each ref id */
                        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[u1_ref_idx] + (i * dst_stride) + j;

                        /* InvWt and off specific to this ref id */
                        off_8x16b = vdupq_n_s16(ps_wt_inp_prms->a_wpred_off[u1_ref_idx]);
                        inv_wt_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_inv_wpred_wt[u1_ref_idx]);

                        /* inp - off */
                        src4_8x16b = vsubq_s16(src0_8x16b, off_8x16b);
                        src5_8x16b = vsubq_s16(src1_8x16b, off_8x16b);
                        src6_8x16b = vsubq_s16(src2_8x16b, off_8x16b);
                        src7_8x16b = vsubq_s16(src3_8x16b, off_8x16b);

                        dst0_4x32b = vmovl_s16(vget_low_s16(src4_8x16b));
                        dst1_4x32b = vmovl_s16(vget_low_s16(src5_8x16b));
                        dst2_4x32b = vmovl_s16(vget_low_s16(src6_8x16b));
                        dst3_4x32b = vmovl_s16(vget_low_s16(src7_8x16b));

                        dst4_4x32b = vmovl_s16(vget_high_s16(src4_8x16b));
                        dst5_4x32b = vmovl_s16(vget_high_s16(src5_8x16b));
                        dst6_4x32b = vmovl_s16(vget_high_s16(src6_8x16b));
                        dst7_4x32b = vmovl_s16(vget_high_s16(src7_8x16b));

                        /* (inp-off) << shift */
                        dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
                        dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
                        dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
                        dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

                        /* (inp-off) << shift */
                        dst4_4x32b = vshlq_s32(dst4_4x32b, log_wdc);
                        dst5_4x32b = vshlq_s32(dst5_4x32b, log_wdc);
                        dst6_4x32b = vshlq_s32(dst6_4x32b, log_wdc);
                        dst7_4x32b = vshlq_s32(dst7_4x32b, log_wdc);

                        /* ((inp-off) << shift) * inv_wt + 1<<14 */
                        dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
                        dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
                        dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
                        dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

                        /* ((inp-off) << shift) * inv_wt + 1<<14 */
                        dst4_4x32b = vmlaq_s32(add_4x32b, dst4_4x32b, inv_wt_4x32b);
                        dst5_4x32b = vmlaq_s32(add_4x32b, dst5_4x32b, inv_wt_4x32b);
                        dst6_4x32b = vmlaq_s32(add_4x32b, dst6_4x32b, inv_wt_4x32b);
                        dst7_4x32b = vmlaq_s32(add_4x32b, dst7_4x32b, inv_wt_4x32b);

                        /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
                        src4_8x16b = vcombine_s16(
                            vshrn_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst4_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src5_8x16b = vcombine_s16(
                            vshrn_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst5_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src6_8x16b = vcombine_s16(
                            vshrn_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst6_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src7_8x16b = vcombine_s16(
                            vshrn_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst7_4x32b, IHEVCE_WT_PRED_SHIFT));
                        /* Store */
                        vst1_u8((pu1_dst + 0 * dst_stride), vqmovun_s16(src4_8x16b));
                        vst1_u8((pu1_dst + 1 * dst_stride), vqmovun_s16(src5_8x16b));
                        vst1_u8((pu1_dst + 2 * dst_stride), vqmovun_s16(src6_8x16b));
                        vst1_u8((pu1_dst + 3 * dst_stride), vqmovun_s16(src7_8x16b));
                    }
                    /* Pointer update */
                    pu1_src += 8;
                }
                /* Pointer update */
                pu1_src = pu1_src - x_count + 4 * src_stride;
            }
        }
        else /* wd multiple of 4 case */
        {
            uint8x16_t src0_16x8b;
            int32x4_t src0_4x32b, src1_4x32b, src2_4x32b, src3_4x32b;
            uint8x8_t out01_8x8b, out23_8x8b;
            WORD32 dst0, dst1, dst2, dst3;
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                {
                    /* ref==num_ref */ /* last ref will be non weighted input */

                    *(WORD32 *)(&pu1_dst[0 * dst_stride]) = *(WORD32 *)(&pu1_src[0 * src_stride]);
                    *(WORD32 *)(&pu1_dst[1 * dst_stride]) = *(WORD32 *)(&pu1_src[1 * src_stride]);
                    *(WORD32 *)(&pu1_dst[2 * dst_stride]) = *(WORD32 *)(&pu1_src[2 * src_stride]);
                    *(WORD32 *)(&pu1_dst[3 * dst_stride]) = *(WORD32 *)(&pu1_src[3 * src_stride]);

                    /* Pointer update */
                    pu1_src += 4;
                    pu1_dst += 4;
                }
                /* Pointer update */
                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }

            if(u1_num_valid_refs)
            {
                pu1_src = ps_curr_layer->pu1_inp;
                pu1_src += (pos_x + (pos_y * src_stride));

                /* Run through all ref ids, except ref==num_ref, which is already done */
                for(ref = 0; ref < u1_num_valid_refs; ref++)
                {
                    U08 u1_ref_idx = ai4_wt_refs[ref];

                    pu1_dst = ps_wt_inp_prms->apu1_wt_inp[u1_ref_idx];

                    /* InvWt and off specific to this ref id */
                    off_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_wpred_off[u1_ref_idx]);
                    inv_wt_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_inv_wpred_wt[u1_ref_idx]);

                    for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
                    {
                        for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                        {
                            src0_16x8b = load_unaligned_u8q(pu1_src, src_stride);

                            src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src0_16x8b)));
                            src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src0_16x8b)));

                            src0_4x32b = vmovl_s16(vget_low_s16(src0_8x16b));
                            src1_4x32b = vmovl_s16(vget_high_s16(src0_8x16b));
                            src2_4x32b = vmovl_s16(vget_low_s16(src1_8x16b));
                            src3_4x32b = vmovl_s16(vget_high_s16(src1_8x16b));

                            /* inp - off */
                            dst0_4x32b = vsubq_s32(src0_4x32b, off_4x32b);
                            dst1_4x32b = vsubq_s32(src1_4x32b, off_4x32b);
                            dst2_4x32b = vsubq_s32(src2_4x32b, off_4x32b);
                            dst3_4x32b = vsubq_s32(src3_4x32b, off_4x32b);

                            /* (inp-off) << shift */
                            dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
                            dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
                            dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
                            dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

                            /* ((inp-off) << shift) * inv_wt + 1<<14 */
                            dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
                            dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
                            dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
                            dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

                            /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
                            /* Narrow each row of four results to unsigned 8 bit */
                            /* (saturating, as in the 8-wide path) before the    */
                            /* 32-bit stores below                               */
                            out01_8x8b = vqmovn_u16(vcombine_u16(
                                vqshrun_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
                                vqshrun_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT)));
                            out23_8x8b = vqmovn_u16(vcombine_u16(
                                vqshrun_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
                                vqshrun_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT)));

                            dst0 = vget_lane_s32(vreinterpret_s32_u8(out01_8x8b), 0);
                            dst1 = vget_lane_s32(vreinterpret_s32_u8(out01_8x8b), 1);
                            dst2 = vget_lane_s32(vreinterpret_s32_u8(out23_8x8b), 0);
                            dst3 = vget_lane_s32(vreinterpret_s32_u8(out23_8x8b), 1);

                            *(WORD32 *)(&pu1_dst[0 * dst_stride]) = dst0;
                            *(WORD32 *)(&pu1_dst[1 * dst_stride]) = dst1;
                            *(WORD32 *)(&pu1_dst[2 * dst_stride]) = dst2;
                            *(WORD32 *)(&pu1_dst[3 * dst_stride]) = dst3;

                            /* Pointer update */
                            pu1_src += 4;
                            pu1_dst += 4;
                        }
                        /* Pointer update */
                        pu1_src = pu1_src - x_count + 4 * src_stride;
                        pu1_dst = pu1_dst - x_count + 4 * dst_stride;
                    }
                }
            }
        }

        /* Padding */
        for(ref = 0; ref < u1_num_valid_refs; ref++)
        {
            /* Check and do padding in right direction if need be */
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ai4_wt_refs[ref]];
            if(x_count != size)
            {
                hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
            }

            /* Check and do padding in bottom direction if need be */
            if(y_count != size)
            {
                hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
            }
        }

        /* Check and do padding in right direction if need be */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];

        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
}