/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_hme_utils_neon.c
*
* @brief
*  Contains NEON intrinsic definitions of the HME utility functions
*
*
* @author
*  ittiam
*
* @par List of Functions:
*  - hme_get_wt_inp_8x8_neon()
*  - hme_get_wt_inp_ctb_neon()
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_debug.h"
#include "ihevc_deblk.h"
#include "ihevc_defs.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_resi_trans.h"
#include "ihevc_sao.h"
#include "ihevc_structs.h"
#include "ihevc_weighted_pred.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_had_satd.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_global_tables.h"

#include "hme_datatype.h"
#include "hme_interface.h"
#include "hme_common_defs.h"
#include "hme_defs.h"
#include "ihevce_me_instr_set_router.h"
#include "hme_globals.h"
#include "hme_utils.h"
#include "hme_coarse.h"
#include "hme_refine.h"

/*****************************************************************************/
/* Constant Macros */
/*****************************************************************************/
#define IHEVCE_WT_PRED_SHIFT 15

/*****************************************************************************/
/* Function Definitions */
/*****************************************************************************/

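/*
 * The routines below vectorize the inverse weighted-prediction step used by
 * HME: (((inp - off) << wpred_log_wdc) * inv_wt + (1 << 14)) >> 15, clipped
 * to [0, 255]. The helper below is a minimal scalar sketch of that per-pixel
 * computation, kept only as an illustration of what the NEON paths compute;
 * the helper name is hypothetical and nothing in this file calls it.
 */
static INLINE U08 ihevce_wt_pred_pixel_scalar_sketch(
    U08 u1_inp, S16 i2_off, S32 i4_inv_wt, S32 i4_log_wdc)
{
    /* (inp - off) << shift */
    S32 i4_tmp = ((S32)u1_inp - i2_off) << i4_log_wdc;

    /* ((inp - off) << shift) * inv_wt + 1 << 14, then >> 15 */
    i4_tmp = (i4_tmp * i4_inv_wt + (1 << 14)) >> IHEVCE_WT_PRED_SHIFT;

    /* Clip to 8 bit, mirroring the saturating narrows in the NEON paths */
    return (U08)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
}
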
static INLINE void ihevce_get_wt_inp_4x8_neon(
    const UWORD8 *pu1_src,
    UWORD8 *pu1_dst,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    WORD32 u1_num_valid_refs,
    WORD32 *ai4_wt_refs,
    WORD32 src_stride,
    WORD32 dst_stride)
{
    S32 inv_wt;
    S16 off;
    uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
    int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b, off_8x16b;
    int32x4_t dst0_4x32b, dst1_4x32b, dst2_4x32b, dst3_4x32b;
    int32x4_t dst4_4x32b, dst5_4x32b, dst6_4x32b, dst7_4x32b;
    int32x4_t add_4x32b, inv_wt_4x32b;
    U08 ref;
    int32x4_t log_wdc = vdupq_n_s32(ps_wt_inp_prms->wpred_log_wdc);

    src0_8x8b = vld1_u8((pu1_src + 0 * src_stride));
    src1_8x8b = vld1_u8((pu1_src + 1 * src_stride));
    src2_8x8b = vld1_u8((pu1_src + 2 * src_stride));
    src3_8x8b = vld1_u8((pu1_src + 3 * src_stride));
    /* Store */
    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

    if(u1_num_valid_refs)
    {
        /* Widen the four source rows to 16 bit */
        src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(src0_8x8b));
        src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(src1_8x8b));
        src2_8x16b = vreinterpretq_s16_u16(vmovl_u8(src2_8x8b));
        src3_8x16b = vreinterpretq_s16_u16(vmovl_u8(src3_8x8b));

        /* Rounding term: 1 << 14, matching the >> 15 below */
        add_4x32b = vdupq_n_s32(0x4000);
    }

    /* Run through all ref ids, except ref==num_ref, which is already done */
    for(ref = 0; ref < u1_num_valid_refs; ref++)
    {
        S32 i4_ref_idx = ai4_wt_refs[ref];

        /* InvWt and off specific to this ref id */
        inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[i4_ref_idx];
        off = (S16)ps_wt_inp_prms->a_wpred_off[i4_ref_idx];

        /* set1 uses multiple instructions : Try to AVOID it */
        off_8x16b = vdupq_n_s16(off);
        inv_wt_4x32b = vdupq_n_s32(inv_wt);

        /* Each ref id may have different wt/offset. */
        /* So we have unique inp buf for each ref id */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[i4_ref_idx];

        /* inp - off */
        src4_8x16b = vsubq_s16(src0_8x16b, off_8x16b);
        src5_8x16b = vsubq_s16(src1_8x16b, off_8x16b);
        src6_8x16b = vsubq_s16(src2_8x16b, off_8x16b);
        src7_8x16b = vsubq_s16(src3_8x16b, off_8x16b);

        dst0_4x32b = vmovl_s16(vget_low_s16(src4_8x16b));
        dst1_4x32b = vmovl_s16(vget_low_s16(src5_8x16b));
        dst2_4x32b = vmovl_s16(vget_low_s16(src6_8x16b));
        dst3_4x32b = vmovl_s16(vget_low_s16(src7_8x16b));

        dst4_4x32b = vmovl_s16(vget_high_s16(src4_8x16b));
        dst5_4x32b = vmovl_s16(vget_high_s16(src5_8x16b));
        dst6_4x32b = vmovl_s16(vget_high_s16(src6_8x16b));
        dst7_4x32b = vmovl_s16(vget_high_s16(src7_8x16b));

        /* (inp-off) << shift */
        dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
        dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
        dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
        dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

        /* (inp-off) << shift */
        dst4_4x32b = vshlq_s32(dst4_4x32b, log_wdc);
        dst5_4x32b = vshlq_s32(dst5_4x32b, log_wdc);
        dst6_4x32b = vshlq_s32(dst6_4x32b, log_wdc);
        dst7_4x32b = vshlq_s32(dst7_4x32b, log_wdc);

        /* ((inp-off) << shift) * inv_wt + 1<<14 */
        dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
        dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
        dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
        dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

        /* ((inp-off) << shift) * inv_wt + 1<<14 */
        dst4_4x32b = vmlaq_s32(add_4x32b, dst4_4x32b, inv_wt_4x32b);
        dst5_4x32b = vmlaq_s32(add_4x32b, dst5_4x32b, inv_wt_4x32b);
        dst6_4x32b = vmlaq_s32(add_4x32b, dst6_4x32b, inv_wt_4x32b);
        dst7_4x32b = vmlaq_s32(add_4x32b, dst7_4x32b, inv_wt_4x32b);

        /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
        src4_8x16b = vcombine_s16(
            vshrn_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst4_4x32b, IHEVCE_WT_PRED_SHIFT));
        src5_8x16b = vcombine_s16(
            vshrn_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst5_4x32b, IHEVCE_WT_PRED_SHIFT));
        src6_8x16b = vcombine_s16(
            vshrn_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst6_4x32b, IHEVCE_WT_PRED_SHIFT));
        src7_8x16b = vcombine_s16(
            vshrn_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT),
            vshrn_n_s32(dst7_4x32b, IHEVCE_WT_PRED_SHIFT));
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), vqmovun_s16(src4_8x16b));
        vst1_u8((pu1_dst + 1 * dst_stride), vqmovun_s16(src5_8x16b));
        vst1_u8((pu1_dst + 2 * dst_stride), vqmovun_s16(src6_8x16b));
        vst1_u8((pu1_dst + 3 * dst_stride), vqmovun_s16(src7_8x16b));
    }
}

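/*
 * hme_get_wt_inp_8x8_neon() generates the (optionally weighted) input for an
 * 8x8 block. The unweighted copy always goes to apu1_wt_inp_buf_array[num_ref];
 * when weighted prediction is on, every reference with a non-default weight or
 * offset additionally gets its own weighted copy via ihevce_get_wt_inp_4x8_neon()
 * on the top and bottom 4x8 halves. Partial blocks at the frame boundary are
 * completed by right/bottom padding.
 */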
void hme_get_wt_inp_8x8_neon(
    layer_ctxt_t *ps_curr_layer,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S32 dst_stride,
    S32 pos_x,
    S32 pos_y,
    S32 size,
    S32 num_ref,
    U08 u1_is_wt_pred_on)
{
    WORD32 ref;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 x_count, y_count;
    WORD32 src_stride = ps_curr_layer->i4_inp_stride;

    /* Make sure the start positions of block are inside frame limits */
    pos_x = MIN(pos_x, ps_curr_layer->i4_wd - 1);
    pos_y = MIN(pos_y, ps_curr_layer->i4_ht - 1);

    /* In case we handle incomplete CTBs, we copy only as much as required */
    /* from input buffers to prevent out of bound accesses. In this */
    /* case, we do padding in x or y or both directions */
    x_count = MIN(size, (ps_curr_layer->i4_wd - pos_x));
    y_count = MIN(size, (ps_curr_layer->i4_ht - pos_y));

    /* Fixed source */
    pu1_src = ps_curr_layer->pu1_inp;
    pu1_src += (pos_x + (pos_y * src_stride));

    if(!u1_is_wt_pred_on)
    {
        uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;

        /************* Top 4x8 Processing ****************/
        /* Load Source : Lower 64 bit */
        src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
        src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
        src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
        src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

        /* ref==num_ref */ /* last ref will be non weighted input */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
        vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
        vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
        vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

        /************* Bottom 4x8 Processing ****************/
        pu1_src += 4 * src_stride;
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref] + 4 * dst_stride;

        /* Load Source : Lower 64 bit */
        src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
        src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
        src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
        src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);
        /* ref==num_ref */ /* last ref will be non weighted input */
        /* Store */
        vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
        vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
        vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
        vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        /* Check and do padding in right direction if need be */
        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }

        for(ref = 0; ref < num_ref + 1; ref++)
        {
            ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        }
    }
    else
    {
        S32 wt, off;
        S32 ai4_wt_refs[MAX_NUM_REF];
        U08 u1_num_valid_refs = 0;

        for(ref = 0; ref < num_ref; ref++)
        {
            wt = ps_wt_inp_prms->a_wpred_wt[ref];
            off = ps_wt_inp_prms->a_wpred_off[ref];

            if((WGHT_DEFAULT == wt) && (0 == off))
            {
                ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
            }
            else
            {
                ai4_wt_refs[u1_num_valid_refs++] = ref;
                ps_wt_inp_prms->apu1_wt_inp[ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[ref];
            }
        }

        ps_wt_inp_prms->apu1_wt_inp[num_ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        /************* Top 4x8 Processing ****************/
        /* ref==num_ref */ /* last ref will be non weighted input */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
        ihevce_get_wt_inp_4x8_neon(
            pu1_src,
            pu1_dst,
            ps_wt_inp_prms,
            u1_num_valid_refs,
            ai4_wt_refs,
            src_stride,
            dst_stride);
        /************* Bottom 4x8 Processing ****************/
        pu1_src += 4 * src_stride;
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref] + 4 * dst_stride;
        ihevce_get_wt_inp_4x8_neon(
            pu1_src,
            pu1_dst,
            ps_wt_inp_prms,
            u1_num_valid_refs,
            ai4_wt_refs,
            src_stride,
            dst_stride);

        for(ref = 0; ref < u1_num_valid_refs; ref++)
        {
            /* Check and do padding in right direction if need be */
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ai4_wt_refs[ref]];
            if(x_count != size)
            {
                hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
            }

            /* Check and do padding in bottom direction if need be */
            if(y_count != size)
            {
                hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
            }
        }

        /* Check and do padding in right direction if need be */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
}

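/*
 * hme_get_wt_inp_ctb_neon() does the same job for a CTB-sized block. The
 * unweighted copy uses 16-, 8- or 4-pixel wide loops depending on the
 * alignment of x_count; when weighted prediction is on, each reference with
 * a non-default weight or offset additionally gets a weighted copy (8- or
 * 4-pixel wide paths), and partial CTBs at the frame boundary are completed
 * by right/bottom padding.
 */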
void hme_get_wt_inp_ctb_neon(
    layer_ctxt_t *ps_curr_layer,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S32 dst_stride,
    S32 pos_x,
    S32 pos_y,
    S32 size,
    S32 num_ref,
    U08 u1_is_wt_pred_on)
{
    WORD32 ref, i, j;
    UWORD8 *pu1_src, *pu1_dst;
    WORD32 x_count, y_count;
    WORD32 src_stride = ps_curr_layer->i4_inp_stride;

    /* In case we handle incomplete CTBs, we copy only as much as required */
    /* from input buffers to prevent out of bound accesses. In this */
    /* case, we do padding in x or y or both directions */
    x_count = MIN(size, (ps_curr_layer->i4_wd - pos_x));
    y_count = MIN(size, (ps_curr_layer->i4_ht - pos_y));

    /* Fixed source */
    pu1_src = ps_curr_layer->pu1_inp;
    pu1_src += (pos_x + (pos_y * src_stride));

    if(!u1_is_wt_pred_on)
    {
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(0 == (x_count & 15)) /* wd multiple of 16 case */
        {
            uint8x16_t src0_16x8b, src1_16x8b, src2_16x8b, src3_16x8b;

            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 16) /* 16 cols at a time */
                {
                    /* Load 4x16 Source */
                    src0_16x8b = vld1q_u8(pu1_src + 0 * src_stride);
                    src1_16x8b = vld1q_u8(pu1_src + 1 * src_stride);
                    src2_16x8b = vld1q_u8(pu1_src + 2 * src_stride);
                    src3_16x8b = vld1q_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    /* Store */
                    vst1q_u8((pu1_dst + 0 * dst_stride), src0_16x8b);
                    vst1q_u8((pu1_dst + 1 * dst_stride), src1_16x8b);
                    vst1q_u8((pu1_dst + 2 * dst_stride), src2_16x8b);
                    vst1q_u8((pu1_dst + 3 * dst_stride), src3_16x8b);

                    pu1_src += 16;
                    pu1_dst += 16;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }
        else if(0 == (x_count & 7)) /* wd multiple of 8 case */
        {
            uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 8) /* 8 cols at a time */
                {
                    /* Load 4x8 Source */
                    src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
                    src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
                    src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
                    src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    /* Store */
                    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
                    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
                    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
                    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

                    pu1_src += 8;
                    pu1_dst += 8;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }
        else /* wd multiple of 4 case */
        {
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                {
                    /* ref==num_ref */ /* last ref will be non weighted input */
                    *(WORD32 *)(&pu1_dst[0 * dst_stride]) = *(WORD32 *)(&pu1_src[0 * src_stride]);
                    *(WORD32 *)(&pu1_dst[1 * dst_stride]) = *(WORD32 *)(&pu1_src[1 * src_stride]);
                    *(WORD32 *)(&pu1_dst[2 * dst_stride]) = *(WORD32 *)(&pu1_src[2 * src_stride]);
                    *(WORD32 *)(&pu1_dst[3 * dst_stride]) = *(WORD32 *)(&pu1_src[3 * src_stride]);

                    pu1_src += 4;
                    pu1_dst += 4;
                }

                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }
        }

        for(i = 0; i < num_ref + 1; i++)
        {
            ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
        }

        /* Padding */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];

        /* Check and do padding in right direction if need be */
        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
    else
    {
        S32 ai4_wt_refs[MAX_NUM_REF];
        U08 u1_num_valid_refs = 0;
        int32x4_t dst0_4x32b, dst1_4x32b, dst2_4x32b, dst3_4x32b;
        int32x4_t inv_wt_4x32b, off_4x32b;
        int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;

        /* Rounding term: 1 << 14, matching the >> 15 below */
        int32x4_t add_4x32b = vdupq_n_s32(0x4000);
        int32x4_t log_wdc = vdupq_n_s32(ps_wt_inp_prms->wpred_log_wdc);

        for(i = 0; i < num_ref; i++)
        {
            if((WGHT_DEFAULT == (ps_wt_inp_prms->a_wpred_wt[i])) &&
               (0 == (ps_wt_inp_prms->a_wpred_off[i])))
            {
                ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];
            }
            else
            {
                ai4_wt_refs[u1_num_valid_refs++] = i;
                ps_wt_inp_prms->apu1_wt_inp[i] = ps_wt_inp_prms->apu1_wt_inp_buf_array[i];
            }
        }

        ps_wt_inp_prms->apu1_wt_inp[num_ref] = ps_wt_inp_prms->apu1_wt_inp_buf_array[num_ref];

        if(0 == (x_count & 7)) /* wd multiple of 8 case */
        {
            uint8x8_t src0_8x8b, src1_8x8b, src2_8x8b, src3_8x8b;
            int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b, off_8x16b;
            int32x4_t dst4_4x32b, dst5_4x32b, dst6_4x32b, dst7_4x32b;

            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 8) /* 8 cols at a time */
                {
                    /* Load 4x8 Source */
                    /* Load Source : Lower 64 bit */
                    src0_8x8b = vld1_u8(pu1_src + 0 * src_stride);
                    src1_8x8b = vld1_u8(pu1_src + 1 * src_stride);
                    src2_8x8b = vld1_u8(pu1_src + 2 * src_stride);
                    src3_8x8b = vld1_u8(pu1_src + 3 * src_stride);

                    /* ref==num_ref */ /* last ref will be non weighted input */
                    pu1_dst = (ps_wt_inp_prms->apu1_wt_inp[num_ref]) + (i * dst_stride) + j;

                    /* Store */
                    vst1_u8((pu1_dst + 0 * dst_stride), src0_8x8b);
                    vst1_u8((pu1_dst + 1 * dst_stride), src1_8x8b);
                    vst1_u8((pu1_dst + 2 * dst_stride), src2_8x8b);
                    vst1_u8((pu1_dst + 3 * dst_stride), src3_8x8b);

                    if(u1_num_valid_refs)
                    {
                        /* Widen the four source rows to 16 bit */
                        src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(src0_8x8b));
                        src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(src1_8x8b));
                        src2_8x16b = vreinterpretq_s16_u16(vmovl_u8(src2_8x8b));
                        src3_8x16b = vreinterpretq_s16_u16(vmovl_u8(src3_8x8b));
                    }

                    /* Run through all ref ids, except ref==num_ref, which is already done */
                    for(ref = 0; ref < u1_num_valid_refs; ref++)
                    {
                        U08 u1_ref_idx = ai4_wt_refs[ref];

                        /* Each ref id may have different wt/offset. */
                        /* So we have unique inp buf for each ref id */
                        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[u1_ref_idx] + (i * dst_stride) + j;

                        /* InvWt and off specific to this ref id */
                        off_8x16b = vdupq_n_s16(ps_wt_inp_prms->a_wpred_off[u1_ref_idx]);
                        inv_wt_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_inv_wpred_wt[u1_ref_idx]);

                        /* inp - off */
                        src4_8x16b = vsubq_s16(src0_8x16b, off_8x16b);
                        src5_8x16b = vsubq_s16(src1_8x16b, off_8x16b);
                        src6_8x16b = vsubq_s16(src2_8x16b, off_8x16b);
                        src7_8x16b = vsubq_s16(src3_8x16b, off_8x16b);

                        dst0_4x32b = vmovl_s16(vget_low_s16(src4_8x16b));
                        dst1_4x32b = vmovl_s16(vget_low_s16(src5_8x16b));
                        dst2_4x32b = vmovl_s16(vget_low_s16(src6_8x16b));
                        dst3_4x32b = vmovl_s16(vget_low_s16(src7_8x16b));

                        dst4_4x32b = vmovl_s16(vget_high_s16(src4_8x16b));
                        dst5_4x32b = vmovl_s16(vget_high_s16(src5_8x16b));
                        dst6_4x32b = vmovl_s16(vget_high_s16(src6_8x16b));
                        dst7_4x32b = vmovl_s16(vget_high_s16(src7_8x16b));

                        /* (inp-off) << shift */
                        dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
                        dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
                        dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
                        dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

                        /* (inp-off) << shift */
                        dst4_4x32b = vshlq_s32(dst4_4x32b, log_wdc);
                        dst5_4x32b = vshlq_s32(dst5_4x32b, log_wdc);
                        dst6_4x32b = vshlq_s32(dst6_4x32b, log_wdc);
                        dst7_4x32b = vshlq_s32(dst7_4x32b, log_wdc);

                        /* ((inp-off) << shift) * inv_wt + 1<<14 */
                        dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
                        dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
                        dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
                        dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

                        /* ((inp-off) << shift) * inv_wt + 1<<14 */
                        dst4_4x32b = vmlaq_s32(add_4x32b, dst4_4x32b, inv_wt_4x32b);
                        dst5_4x32b = vmlaq_s32(add_4x32b, dst5_4x32b, inv_wt_4x32b);
                        dst6_4x32b = vmlaq_s32(add_4x32b, dst6_4x32b, inv_wt_4x32b);
                        dst7_4x32b = vmlaq_s32(add_4x32b, dst7_4x32b, inv_wt_4x32b);

                        /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15 */
                        src4_8x16b = vcombine_s16(
                            vshrn_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst4_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src5_8x16b = vcombine_s16(
                            vshrn_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst5_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src6_8x16b = vcombine_s16(
                            vshrn_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst6_4x32b, IHEVCE_WT_PRED_SHIFT));
                        src7_8x16b = vcombine_s16(
                            vshrn_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT),
                            vshrn_n_s32(dst7_4x32b, IHEVCE_WT_PRED_SHIFT));
                        /* Store */
                        vst1_u8((pu1_dst + 0 * dst_stride), vqmovun_s16(src4_8x16b));
                        vst1_u8((pu1_dst + 1 * dst_stride), vqmovun_s16(src5_8x16b));
                        vst1_u8((pu1_dst + 2 * dst_stride), vqmovun_s16(src6_8x16b));
                        vst1_u8((pu1_dst + 3 * dst_stride), vqmovun_s16(src7_8x16b));
                    }
                    /* Pointer update */
                    pu1_src += 8;
                }
                /* Pointer update */
                pu1_src = pu1_src - x_count + 4 * src_stride;
            }
        }
        else /* wd multiple of 4 case */
        {
            uint8x16_t src0_16x8b;
            int32x4_t src0_4x32b, src1_4x32b, src2_4x32b, src3_4x32b;
            uint8x8_t tmp0_8x8b, tmp1_8x8b;
            WORD32 dst0, dst1, dst2, dst3;
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];
            for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
            {
                for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                {
                    /* ref==num_ref */ /* last ref will be non weighted input */

                    *(WORD32 *)(&pu1_dst[0 * dst_stride]) = *(WORD32 *)(&pu1_src[0 * src_stride]);
                    *(WORD32 *)(&pu1_dst[1 * dst_stride]) = *(WORD32 *)(&pu1_src[1 * src_stride]);
                    *(WORD32 *)(&pu1_dst[2 * dst_stride]) = *(WORD32 *)(&pu1_src[2 * src_stride]);
                    *(WORD32 *)(&pu1_dst[3 * dst_stride]) = *(WORD32 *)(&pu1_src[3 * src_stride]);

                    /* Pointer update */
                    pu1_src += 4;
                    pu1_dst += 4;
                }
                /* Pointer update */
                pu1_src = pu1_src - x_count + 4 * src_stride;
                pu1_dst = pu1_dst - x_count + 4 * dst_stride;
            }

            if(u1_num_valid_refs)
            {
                pu1_src = ps_curr_layer->pu1_inp;
                pu1_src += (pos_x + (pos_y * src_stride));

                /* Run through all ref ids, except ref==num_ref, which is already done */
                for(ref = 0; ref < u1_num_valid_refs; ref++)
                {
                    U08 u1_ref_idx = ai4_wt_refs[ref];

                    pu1_dst = ps_wt_inp_prms->apu1_wt_inp[u1_ref_idx];

                    /* InvWt and off specific to this ref id */
                    off_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_wpred_off[u1_ref_idx]);
                    inv_wt_4x32b = vdupq_n_s32(ps_wt_inp_prms->a_inv_wpred_wt[u1_ref_idx]);

                    for(i = 0; i < y_count; i += 4) /* 4 rows at a time */
                    {
                        for(j = 0; j < x_count; j += 4) /* 4 cols at a time */
                        {
                            /* Load 4 rows of 4 pixels each into one 16-byte vector */
                            src0_16x8b = load_unaligned_u8q(pu1_src, src_stride);

                            src0_8x16b = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src0_16x8b)));
                            src1_8x16b = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src0_16x8b)));

                            src0_4x32b = vmovl_s16(vget_low_s16(src0_8x16b));
                            src1_4x32b = vmovl_s16(vget_high_s16(src0_8x16b));
                            src2_4x32b = vmovl_s16(vget_low_s16(src1_8x16b));
                            src3_4x32b = vmovl_s16(vget_high_s16(src1_8x16b));

                            /* inp - off */
                            dst0_4x32b = vsubq_s32(src0_4x32b, off_4x32b);
                            dst1_4x32b = vsubq_s32(src1_4x32b, off_4x32b);
                            dst2_4x32b = vsubq_s32(src2_4x32b, off_4x32b);
                            dst3_4x32b = vsubq_s32(src3_4x32b, off_4x32b);

                            /* (inp-off) << shift */
                            dst0_4x32b = vshlq_s32(dst0_4x32b, log_wdc);
                            dst1_4x32b = vshlq_s32(dst1_4x32b, log_wdc);
                            dst2_4x32b = vshlq_s32(dst2_4x32b, log_wdc);
                            dst3_4x32b = vshlq_s32(dst3_4x32b, log_wdc);

                            /* ((inp-off) << shift) * inv_wt + 1<<14 */
                            dst0_4x32b = vmlaq_s32(add_4x32b, dst0_4x32b, inv_wt_4x32b);
                            dst1_4x32b = vmlaq_s32(add_4x32b, dst1_4x32b, inv_wt_4x32b);
                            dst2_4x32b = vmlaq_s32(add_4x32b, dst2_4x32b, inv_wt_4x32b);
                            dst3_4x32b = vmlaq_s32(add_4x32b, dst3_4x32b, inv_wt_4x32b);

                            /* (((inp-off) << shift) * inv_wt + 1<<14) >> 15, */
                            /* saturated to 16 bit, then narrowed to 8 bit so */
                            /* that each row packs 4 pixels into one WORD32   */
                            tmp0_8x8b = vqmovn_u16(vcombine_u16(
                                vqshrun_n_s32(dst0_4x32b, IHEVCE_WT_PRED_SHIFT),
                                vqshrun_n_s32(dst1_4x32b, IHEVCE_WT_PRED_SHIFT)));
                            tmp1_8x8b = vqmovn_u16(vcombine_u16(
                                vqshrun_n_s32(dst2_4x32b, IHEVCE_WT_PRED_SHIFT),
                                vqshrun_n_s32(dst3_4x32b, IHEVCE_WT_PRED_SHIFT)));

                            dst0 = (WORD32)vget_lane_u32(vreinterpret_u32_u8(tmp0_8x8b), 0);
                            dst1 = (WORD32)vget_lane_u32(vreinterpret_u32_u8(tmp0_8x8b), 1);
                            dst2 = (WORD32)vget_lane_u32(vreinterpret_u32_u8(tmp1_8x8b), 0);
                            dst3 = (WORD32)vget_lane_u32(vreinterpret_u32_u8(tmp1_8x8b), 1);

                            *(WORD32 *)(&pu1_dst[0 * dst_stride]) = dst0;
                            *(WORD32 *)(&pu1_dst[1 * dst_stride]) = dst1;
                            *(WORD32 *)(&pu1_dst[2 * dst_stride]) = dst2;
                            *(WORD32 *)(&pu1_dst[3 * dst_stride]) = dst3;

                            /* Pointer update */
                            pu1_src += 4;
                            pu1_dst += 4;
                        }
                        /* Pointer update */
                        pu1_src = pu1_src - x_count + 4 * src_stride;
                        pu1_dst = pu1_dst - x_count + 4 * dst_stride;
                    }
                }
            }
        }

        /* Padding */
        for(ref = 0; ref < u1_num_valid_refs; ref++)
        {
            /* Check and do padding in right direction if need be */
            pu1_dst = ps_wt_inp_prms->apu1_wt_inp[ai4_wt_refs[ref]];
            if(x_count != size)
            {
                hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
            }

            /* Check and do padding in bottom direction if need be */
            if(y_count != size)
            {
                hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
            }
        }

        /* Check and do padding in right direction if need be */
        pu1_dst = ps_wt_inp_prms->apu1_wt_inp[num_ref];

        if(x_count != size)
        {
            hme_pad_right(pu1_dst + x_count - 1, dst_stride, size - x_count, y_count);
        }

        /* Check and do padding in bottom direction if need be */
        if(y_count != size)
        {
            hme_pad_bot(pu1_dst + (y_count - 1) * dst_stride, dst_stride, size - y_count, size);
        }
    }
}