/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_neon_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
*  prediction
*
* @author
*  Parthiban V
*
* @par List of Functions:
*  - ihevc_weighted_pred_uni_neonintr()
*  - ihevc_weighted_pred_chroma_uni_neonintr()
*  - ihevc_weighted_pred_bi_neonintr()
*  - ihevc_weighted_pred_chroma_bi_neonintr()
*  - ihevc_weighted_pred_bi_default_neonintr()
*  - ihevc_weighted_pred_chroma_bi_default_neonintr()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_inter_pred.h"
#include "arm_neon.h"


/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and stores
*  the result at the location pointed to by pu1_dst. Assumptions: the function
*  is optimized on the assumption that width and height are multiples of 2.
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
*  offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with the source
*
* @param[in] off0
*  Offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

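/* Illustrative scalar sketch (not part of the original implementation): the
 * per-pixel computation that the NEON loop below vectorizes. The helper name
 * and the explicit [0, 255] clamp are assumptions for illustration only.
 */
static inline UWORD8 wp_uni_pixel_sketch(WORD16 src, WORD32 wgt0, WORD32 off0,
                                         WORD32 shift, WORD32 lvl_shift)
{
    /* dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + off0 */
    WORD32 val = ((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift;
    val += off0;
    /* Saturate to the 8-bit pixel range, mirroring vqmovun_s32/vqmovn_u16 */
    return (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
}
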
void ihevc_weighted_pred_uni_neonintr(WORD16 *pi2_src,
                                      UWORD8 *pu1_dst,
                                      WORD32 src_strd,
                                      WORD32 dst_strd,
                                      WORD32 wgt0,
                                      WORD32 off0,
                                      WORD32 shift,
                                      WORD32 lvl_shift,
                                      WORD32 ht,
                                      WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0 + (off0 << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);
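    /* Note: tmp_lvl_shift folds the level shift, rounding term and offset into
     * one constant, and tmp_shift is negated because vshlq_s32 with a negative
     * shift count performs the required arithmetic right shift. */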

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height loop has also been unrolled, hence 2 rows are processed */
    /* per iteration, and the stores are handled for both rows.           */
    /* vcombine_u16 is used since, after narrowing, we get a 16x4 value   */
    /* that cannot be saturated and narrowed directly.                    */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_n_s16(pi2_src_val1, (int16_t)wgt0);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t);
            i4_tmp2_t = vmull_n_s16(pi2_src_val2, (int16_t)wgt0);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_UNI

/**
*******************************************************************************
*
* @brief
*  Does chroma uni-weighted prediction on the array pointed to by pi2_src and
*  stores the result at the location pointed to by pu1_dst. Assumptions: the
*  function is optimized on the assumption that width and height are
*  multiples of 2.
*
* @par Description:
*  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
*  offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Weight to be multiplied with the Cb source
*
* @param[in] wgt0_cr
*  Weight to be multiplied with the Cr source
*
* @param[in] off0_cb
*  Cb offset to be added after rounding and shifting
*
* @param[in] off0_cr
*  Cr offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (in chroma samples per plane)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_uni_neonintr(WORD16 *pi2_src,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd,
                                             WORD32 dst_strd,
                                             WORD32 wgt0_cb,
                                             WORD32 wgt0_cr,
                                             WORD32 off0_cb,
                                             WORD32 off0_cr,
                                             WORD32 shift,
                                             WORD32 lvl_shift,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src_val1;
    int16x4_t pi2_src_val2;
    int32x4_t i4_tmp1_t;
    int32x4_t i4_tmp2_t;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v;
    int16x4x2_t wgt0;
    WORD16 *pi2_src_tmp;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = lvl_shift * wgt0_cb + (off0_cb << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = lvl_shift * wgt0_cr + (off0_cr << shift);
    tmp_lvl_shift += (1 << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
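    /* The vzip above builds {cb, cr, cb, cr} weight and constant vectors so a
     * single vmull_s16/vaddq_s32 pair handles the interleaved Cb/Cr samples. */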

    /* i4_tmp1_t & i4_tmp2_t are used to process 2 rows at a time.        */
    /* The height loop has also been unrolled, hence 2 rows are processed */
    /* per iteration, and the stores are handled for both rows.           */
    /* vcombine_u16 is used since, after narrowing, we get a 16x4 value   */
    /* that cannot be saturated and narrowed directly.                    */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp = pi2_src + src_strd;

            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src_val1 = vld1_s16((int16_t *)pi2_src);
            pi2_src += 4;

            pi2_src_val2 = vld1_s16((int16_t *)pi2_src_tmp);
            i4_tmp1_t = vmull_s16(pi2_src_val1, wgt0.val[0]);

            i4_tmp1_t = vaddq_s32(i4_tmp1_t, tmp_lvl_shift_t.val[0]);
            i4_tmp2_t = vmull_s16(pi2_src_val2, wgt0.val[0]);

            sto_res_tmp1 = vshlq_s32(i4_tmp1_t, tmp_shift_t);
            i4_tmp2_t = vaddq_s32(i4_tmp2_t, tmp_lvl_shift_t.val[0]);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src += 2 * src_strd - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_UNI

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed to by pi2_src1 and
*  pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized on the assumption that width and
*  height are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
*  ((off0 + off1 + 1) << (shift - 1)) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with source 1
*
* @param[in] off0
*  Offset 0
*
* @param[in] wgt1
*  Weight to be multiplied with source 2
*
* @param[in] off1
*  Offset 1
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

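/* Illustrative scalar sketch (not part of the original implementation): the
 * per-pixel bi-weighted computation that the NEON loop below vectorizes. The
 * helper name and the explicit [0, 255] clamp are assumptions.
 */
static inline UWORD8 wp_bi_pixel_sketch(WORD16 src1, WORD16 src2,
                                        WORD32 wgt0, WORD32 off0,
                                        WORD32 wgt1, WORD32 off1,
                                        WORD32 shift,
                                        WORD32 lvl_shift1, WORD32 lvl_shift2)
{
    WORD32 val = (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1;
    val += (off0 + off1 + 1) << (shift - 1);
    val >>= shift;
    /* Saturate to the 8-bit pixel range, as vqmovun_s32/vqmovn_u16 do */
    return (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
}
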
void ihevc_weighted_pred_bi_neonintr(WORD16 *pi2_src1,
                                     WORD16 *pi2_src2,
                                     UWORD8 *pu1_dst,
                                     WORD32 src_strd1,
                                     WORD32 src_strd2,
                                     WORD32 dst_strd,
                                     WORD32 wgt0,
                                     WORD32 off0,
                                     WORD32 wgt1,
                                     WORD32 off1,
                                     WORD32 shift,
                                     WORD32 lvl_shift1,
                                     WORD32 lvl_shift2,
                                     WORD32 ht,
                                     WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0) + (lvl_shift2 * wgt1);
    tmp_lvl_shift += ((off0 + off1 + 1) << (shift - 1));
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    /* The i4_tmp1_t* & i4_tmp2_t* accumulators process 2 rows at a time. */
    /* The height loop has also been unrolled, hence 2 rows are processed */
    /* per iteration, and the stores are handled for both rows.           */
    /* vcombine_u16 is used since, after narrowing, we get a 16x4 value   */
    /* that cannot be saturated and narrowed directly.                    */

441
442 for(row = ht; row > 0; row -= 2)
443 {
444 for(col = wd; col > 0; col -= 4)
445 {
446 pi2_src_tmp1 = pi2_src1 + src_strd1;
447 pi2_src_tmp2 = pi2_src2 + src_strd2;
448
449 pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
450 pi2_src1 += 4;
451 pu1_dst_tmp = pu1_dst + dst_strd;
452
453 pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
454 pi2_src2 += 4;
455 i4_tmp1_t1 = vmull_n_s16(pi2_src1_val1, (int16_t)wgt0);
456
457 pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
458 i4_tmp1_t2 = vmull_n_s16(pi2_src2_val1, (int16_t)wgt1);
459
460 pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
461 i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);
462
463 i4_tmp2_t1 = vmull_n_s16(pi2_src1_val2, (int16_t)wgt0);
464 i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);
465
466 i4_tmp2_t2 = vmull_n_s16(pi2_src2_val2, (int16_t)wgt1);
467 sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);
468
469 i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
470 sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
471
472 i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
473 sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
474
475 sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
476 sto_res = vqmovn_u16(sto_res_tmp3);
477
478 sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
479 sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);
480
481 vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
482 pu1_dst += 4;
483
484 sto_res = vqmovn_u16(sto_res_tmp3);
485 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
486 }
487 pi2_src1 += 2 * src_strd1 - wd;
488 pi2_src2 += 2 * src_strd2 - wd;
489 pu1_dst += 2 * dst_strd - wd;
490 }
491 }
492 //WEIGHTED_PRED_BI
493
/**
*******************************************************************************
*
* @brief
*  Does chroma bi-weighted prediction on the arrays pointed to by pi2_src1
*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized on the assumption that width and
*  height are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
*  ((off0 + off1 + 1) << (shift - 1)) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Cb weight to be multiplied with source 1
*
* @param[in] wgt0_cr
*  Cr weight to be multiplied with source 1
*
* @param[in] off0_cb
*  Cb offset 0
*
* @param[in] off0_cr
*  Cr offset 0
*
* @param[in] wgt1_cb
*  Cb weight to be multiplied with source 2
*
* @param[in] wgt1_cr
*  Cr weight to be multiplied with source 2
*
* @param[in] off1_cb
*  Cb offset 1
*
* @param[in] off1_cr
*  Cr offset 1
*
* @param[in] shift
*  (14 - Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (in chroma samples per plane)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_neonintr(WORD16 *pi2_src1,
                                            WORD16 *pi2_src2,
                                            UWORD8 *pu1_dst,
                                            WORD32 src_strd1,
                                            WORD32 src_strd2,
                                            WORD32 dst_strd,
                                            WORD32 wgt0_cb,
                                            WORD32 wgt0_cr,
                                            WORD32 off0_cb,
                                            WORD32 off0_cr,
                                            WORD32 wgt1_cb,
                                            WORD32 wgt1_cr,
                                            WORD32 off1_cb,
                                            WORD32 off1_cr,
                                            WORD32 shift,
                                            WORD32 lvl_shift1,
                                            WORD32 lvl_shift2,
                                            WORD32 ht,
                                            WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t_u, tmp_lvl_shift_t_v;
    int32x4x2_t tmp_lvl_shift_t;
    WORD32 tmp_shift = 0 - shift;
    int32x4_t tmp_shift_t;
    int16x4_t tmp_wgt0_u, tmp_wgt0_v, tmp_wgt1_u, tmp_wgt1_v;
    int16x4x2_t wgt0, wgt1;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;

    WORD32 tmp_lvl_shift = (lvl_shift1 * wgt0_cb) + (lvl_shift2 * wgt1_cb);
    tmp_lvl_shift += ((off0_cb + off1_cb + 1) << (shift - 1));
    tmp_lvl_shift_t_u = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift = (lvl_shift1 * wgt0_cr) + (lvl_shift2 * wgt1_cr);
    tmp_lvl_shift += ((off0_cr + off1_cr + 1) << (shift - 1));
    tmp_lvl_shift_t_v = vmovq_n_s32(tmp_lvl_shift);

    tmp_lvl_shift_t = vzipq_s32(tmp_lvl_shift_t_u, tmp_lvl_shift_t_v);

    tmp_shift_t = vmovq_n_s32(tmp_shift);

    tmp_wgt0_u = vdup_n_s16(wgt0_cb);
    tmp_wgt0_v = vdup_n_s16(wgt0_cr);
    wgt0 = vzip_s16(tmp_wgt0_u, tmp_wgt0_v);
    tmp_wgt1_u = vdup_n_s16(wgt1_cb);
    tmp_wgt1_v = vdup_n_s16(wgt1_cr);
    wgt1 = vzip_s16(tmp_wgt1_u, tmp_wgt1_v);
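    /* As in the chroma uni path, vzip interleaves the Cb/Cr weights and
     * constants so one multiply-accumulate covers both planes. */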

    /* The i4_tmp1_t* & i4_tmp2_t* accumulators process 2 rows at a time. */
    /* The height loop has also been unrolled, hence 2 rows are processed */
    /* per iteration, and the stores are handled for both rows.           */
    /* vcombine_u16 is used since, after narrowing, we get a 16x4 value   */
    /* that cannot be saturated and narrowed directly.                    */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vmull_s16(pi2_src1_val1, wgt0.val[0]);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vmull_s16(pi2_src2_val1, wgt1.val[0]);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vmull_s16(pi2_src1_val2, wgt0.val[0]);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t.val[0]);

            i4_tmp2_t2 = vmull_s16(pi2_src2_val2, wgt1.val[0]);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t.val[0]);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed to by pi2_src1
*  and pi2_src2 and stores the result at the location pointed to by pu1_dst.
*  Assumptions: the function is optimized on the assumption that width and
*  height are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
*  >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

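/* Illustrative scalar sketch (not part of the original implementation): the
 * default bi-prediction average that the NEON loop below vectorizes, with
 * shift = SHIFT_14_MINUS_BIT_DEPTH + 1 (i.e. 15 - BitDepth). The helper name
 * and the explicit [0, 255] clamp are assumptions.
 */
static inline UWORD8 wp_bi_default_pixel_sketch(WORD16 src1, WORD16 src2,
                                                WORD32 lvl_shift1,
                                                WORD32 lvl_shift2)
{
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    WORD32 val = (src1 + lvl_shift1) + (src2 + lvl_shift2);
    val = (val + (1 << (shift - 1))) >> shift;  /* round-to-nearest average */
    return (UWORD8)(val < 0 ? 0 : (val > 255 ? 255 : val));
}
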
void ihevc_weighted_pred_bi_default_neonintr(WORD16 *pi2_src1,
                                             WORD16 *pi2_src2,
                                             UWORD8 *pu1_dst,
                                             WORD32 src_strd1,
                                             WORD32 src_strd2,
                                             WORD32 dst_strd,
                                             WORD32 lvl_shift1,
                                             WORD32 lvl_shift2,
                                             WORD32 ht,
                                             WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* The i4_tmp1_t* & i4_tmp2_t* accumulators process 2 rows at a time. */
    /* The height loop has also been unrolled, hence 2 rows are processed */
    /* per iteration, and the stores are handled for both rows.           */
    /* vcombine_u16 is used since, after narrowing, we get a 16x4 value   */
    /* that cannot be saturated and narrowed directly.                    */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - wd;
        pi2_src2 += 2 * src_strd2 - wd;
        pu1_dst += 2 * dst_strd - wd;
    }
}
//WEIGHTED_PRED_BI_DEFAULT

/**
*******************************************************************************
*
* @brief
*  Does chroma default bi-weighted prediction on the arrays pointed to by
*  pi2_src1 and pi2_src2 and stores the result at the location pointed to by
*  pu1_dst. Assumptions: the function is optimized on the assumption that
*  width and height are multiples of 2.
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
*  >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  Added before shift and offset
*
* @param[in] lvl_shift2
*  Added before shift and offset
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (in chroma samples per plane)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

void ihevc_weighted_pred_chroma_bi_default_neonintr(WORD16 *pi2_src1,
                                                    WORD16 *pi2_src2,
                                                    UWORD8 *pu1_dst,
                                                    WORD32 src_strd1,
                                                    WORD32 src_strd2,
                                                    WORD32 dst_strd,
                                                    WORD32 lvl_shift1,
                                                    WORD32 lvl_shift2,
                                                    WORD32 ht,
                                                    WORD32 wd)
{
    WORD32 row, col;
    int16x4_t pi2_src1_val1;
    int16x4_t pi2_src1_val2;
    int16x4_t pi2_src2_val1;
    int16x4_t pi2_src2_val2;
    int32x4_t i4_tmp1_t1;
    int32x4_t i4_tmp1_t2;
    int32x4_t i4_tmp2_t1;
    int32x4_t i4_tmp2_t2;
    int32x4_t sto_res_tmp1;
    uint16x4_t sto_res_tmp2;
    uint16x8_t sto_res_tmp3;
    uint8x8_t sto_res;
    int32x4_t tmp_lvl_shift_t;
    int32x4_t tmp_shift_t;
    WORD16 *pi2_src_tmp1;
    WORD16 *pi2_src_tmp2;
    UWORD8 *pu1_dst_tmp;
    WORD32 shift;
    WORD32 tmp_shift;
    WORD32 tmp_lvl_shift;
    int16x4_t lvl_shift1_t;
    int16x4_t lvl_shift2_t;

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    tmp_shift = 0 - shift;
    tmp_lvl_shift = 1 << (shift - 1);
    tmp_lvl_shift_t = vmovq_n_s32(tmp_lvl_shift);
    tmp_shift_t = vmovq_n_s32(tmp_shift);

    lvl_shift1_t = vmov_n_s16((int16_t)lvl_shift1);
    lvl_shift2_t = vmov_n_s16((int16_t)lvl_shift2);

    /* The i4_tmp1_t* & i4_tmp2_t* accumulators process 2 rows at a time. */
    /* The height loop has also been unrolled, hence 2 rows are processed */
    /* per iteration, and the stores are handled for both rows.           */
    /* vcombine_u16 is used since, after narrowing, we get a 16x4 value   */
    /* that cannot be saturated and narrowed directly.                    */

    for(row = ht; row > 0; row -= 2)
    {
        for(col = 2 * wd; col > 0; col -= 4)
        {
            pi2_src_tmp1 = pi2_src1 + src_strd1;
            pi2_src_tmp2 = pi2_src2 + src_strd2;

            pi2_src1_val1 = vld1_s16((int16_t *)pi2_src1);
            pi2_src1 += 4;
            pu1_dst_tmp = pu1_dst + dst_strd;

            pi2_src2_val1 = vld1_s16((int16_t *)pi2_src2);
            pi2_src2 += 4;
            i4_tmp1_t1 = vaddl_s16(pi2_src1_val1, lvl_shift1_t);

            pi2_src1_val2 = vld1_s16((int16_t *)pi2_src_tmp1);
            i4_tmp1_t2 = vaddl_s16(pi2_src2_val1, lvl_shift2_t);

            pi2_src2_val2 = vld1_s16((int16_t *)pi2_src_tmp2);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, i4_tmp1_t2);

            i4_tmp2_t1 = vaddl_s16(pi2_src1_val2, lvl_shift1_t);
            i4_tmp1_t1 = vaddq_s32(i4_tmp1_t1, tmp_lvl_shift_t);

            i4_tmp2_t2 = vaddl_s16(pi2_src2_val2, lvl_shift2_t);
            sto_res_tmp1 = vshlq_s32(i4_tmp1_t1, tmp_shift_t);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, i4_tmp2_t2);
            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);

            i4_tmp2_t1 = vaddq_s32(i4_tmp2_t1, tmp_lvl_shift_t);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            sto_res_tmp1 = vshlq_s32(i4_tmp2_t1, tmp_shift_t);
            sto_res = vqmovn_u16(sto_res_tmp3);

            sto_res_tmp2 = vqmovun_s32(sto_res_tmp1);
            sto_res_tmp3 = vcombine_u16(sto_res_tmp2, sto_res_tmp2);

            vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
            pu1_dst += 4;

            sto_res = vqmovn_u16(sto_res_tmp3);
            vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 0);
        }
        pi2_src1 += 2 * src_strd1 - 2 * wd;
        pi2_src2 += 2 * src_strd2 - 2 * wd;
        pu1_dst += 2 * dst_strd - 2 * wd;
    }
}
//WEIGHTED_PRED_CHROMA_BI_DEFAULT
