1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_intra_pred_filters_neon_intr.c
22 *
23 * @brief
24 * Contains function Definition for intra prediction interpolation filters
25 *
26 *
27 * @author
28 * Yogeswaran RS
29 *
30 * @par List of Functions:
31 * - ihevc_intra_pred_luma_planar()
32 * - ihevc_intra_pred_luma_dc()
33 * - ihevc_intra_pred_luma_horz()
34 * - ihevc_intra_pred_luma_ver()
35 * - ihevc_intra_pred_luma_mode2()
36 * - ihevc_intra_pred_luma_mode_18_34()
37 *
38 * @remarks
39 * None
40 *
41 *******************************************************************************
42 */
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46 #include <stdio.h>
47
48 #include "ihevc_typedefs.h"
49 #include "ihevc_intra_pred.h"
50 #include "ihevc_macros.h"
51 #include "ihevc_func_selector.h"
52 #include "arm_neon.h"
53 #include "ihevc_platform_macros.h"
54 #include "ihevc_common_tables.h"
55
56 /****************************************************************************/
57 /* Constant Macros */
58 /****************************************************************************/
59 #define MAX_CU_SIZE 64
60 #define BIT_DEPTH 8
61 #define T32_4NT 128
62 #define T16_4NT 64
63
64
65
66 /*****************************************************************************/
67 /* Table Look-up */
68 /*****************************************************************************/
69
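/* GET_BITS(y, x) evaluates to a non-zero value when bit x of y is set;      */
/* the trailing '&& (1 << x)' term only forces a zero/non-zero result        */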
70 #define GET_BITS(y,x) (((y) & (1 << (x))) && (1 << (x)))
71
72 /*****************************************************************************/
73 /* Function Definition */
74 /*****************************************************************************/
75
76 /**
77 *******************************************************************************
78 *
79 * @brief
80 * Intra prediction interpolation filter for pu1_ref substitution
81 *
82 *
83 * @par Description:
84 * Reference substitution process for samples unavailable for prediction
85 * Refer to section 8.4.4.2.2
86 *
87 * @param[in] pu1_top_left
88 * UWORD8 pointer to the top-left
89 *
90 * @param[in] pu1_top
91 * UWORD8 pointer to the top
92 *
93 * @param[in] pu1_left
94 * UWORD8 pointer to the left
95 *
96 * @param[in] src_strd
97 * WORD32 Source stride
98 *
99 * @param[in] nbr_flags
100 * WORD32 neighbor availability flags
101 *
102 * @param[in] nt
103 * WORD32 transform Block size
104 *
105 * @param[in] dst_strd
106 * WORD32 Destination stride
107 *
108 * @returns
109 *
110 * @remarks
111 * None
112 *
113 *******************************************************************************
114 */
115
116
117 void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
118 UWORD8 *pu1_top,
119 UWORD8 *pu1_left,
120 WORD32 src_strd,
121 WORD32 nt,
122 WORD32 nbr_flags,
123 UWORD8 *pu1_dst,
124 WORD32 dst_strd)
125 {
126 UWORD8 pu1_ref;
127 WORD32 dc_val, i;
128 WORD32 total_samples = (4 * nt) + 1;
129 WORD32 two_nt = 2 * nt;
130 WORD32 three_nt = 3 * nt;
131 WORD32 get_bits;
132 WORD32 next;
133 WORD32 bot_left, left, top, tp_right, tp_left;
134 WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
135 UNUSED(dst_strd);
136 dc_val = 1 << (BIT_DEPTH - 1);
137
138 /* Neighbor Flag Structure*/
139 /* Top-Left | Top-Right | Top | Left | Bottom-Left
140 1 4 4 4 4
141 */
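/* Example (1 bit per neighbour, as used by the nt <= 8 path below):          */
/* nbr_flags = 0x11111 marks all five neighbours as available, while          */
/* nbr_flags = 0x11110 marks only the bottom-left as unavailable              */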
142
143 /* If no neighbor flags are present, fill the neighbor samples with DC value */
144 if(nbr_flags == 0)
145 {
146 for(i = 0; i < total_samples; i++)
147 {
148 pu1_dst[i] = dc_val;
149 }
150 }
151 else
152 {
153 /* Else fill the corresponding samples */
154 pu1_dst[two_nt] = *pu1_top_left;
155 UWORD8 *pu1_dst_tmp2 = pu1_dst;
156 UWORD8 *pu1_top_tmp = pu1_top;
157 pu1_dst_tmp2 += two_nt + 1;
158
159 for(i = 0; i < two_nt; i++)
160 pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
161
162 uint8x8_t src;
163 for(i = two_nt; i > 0; i -= 8)
164 {
165 src = vld1_u8(pu1_top_tmp);
166 pu1_top_tmp += 8;
167 vst1_u8(pu1_dst_tmp2, src);
168 pu1_dst_tmp2 += 8;
169 }
170
171 if(nt <= 8)
172 {
173 /* 1 bit extraction for all the neighboring blocks */
174 tp_left = (nbr_flags & 0x10000) >> 16;
175 bot_left = nbr_flags & 0x1;
176 left = (nbr_flags & 0x10) >> 4;
177 top = (nbr_flags & 0x100) >> 8;
178 tp_right = (nbr_flags & 0x1000) >> 12;
179
180 next = 1;
181
182 /* If bottom-left is not available, do the reverse substitution process */
183 if(bot_left == 0)
184 {
185 WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
186
187 /* Check for the 1st available sample from bottom-left*/
188 while(!a_nbr_flag[next])
189 next++;
190
191 /* If Left, top-left are available*/
192 if(next <= 2)
193 {
194 idx = nt * next;
195 pu1_ref = pu1_dst[idx];
196 for(i = 0; i < idx; i++)
197 pu1_dst[i] = pu1_ref;
198 }
199 else /* If top, top-right are available */
200 {
201 /* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
202 idx = (nt * (next - 1)) + 1;
203 pu1_ref = pu1_dst[idx];
204 for(i = 0; i < idx; i++)
205 pu1_dst[i] = pu1_ref;
206 }
207 }
208
209 /* Forward Substitution Process */
210 /* If left is Unavailable, copy the last bottom-left value */
211
212 if(left == 0)
213 {
214 uint8x8_t dup_pu1_dst1;
215 UWORD8 *pu1_dst_const_nt = pu1_dst;
216 pu1_dst_const_nt += nt;
217
218 if(0 == (nt & 7))
219 {
220 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
221 for(i = nt; i > 0; i -= 8)
222 {
223 vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
224 pu1_dst_const_nt += 8;
225
226 }
227 }
228 else
229 {
230 //uint32x2_t dup_pu1_dst4;
231 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
232 //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]);
233 for(i = nt; i > 0; i -= 4)
234 {
235 vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
236 pu1_dst_const_nt += 4;
237
238 }
239
240 }
241
242 }
243 if(tp_left == 0)
244 pu1_dst[two_nt] = pu1_dst[two_nt - 1];
245 if(top == 0)
246 {
247
248 if(0 == (nt & 7))
249 {
250 uint8x8_t dup_pu1_dst2;
251 UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
252 pu1_dst_const_two_nt_1 += (two_nt + 1);
253 dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
254 for(i = nt; i > 0; i -= 8)
255 {
256 vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
257 pu1_dst_const_two_nt_1 += 8;
258
259 }
260 }
261 else
262 {
263 for(i = 0; i < nt; i++)
264 pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
265 }
266 }
267 if(tp_right == 0)
268 {
269 uint8x8_t dup_pu1_dst3;
270 UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
271 pu1_dst_const_three_nt_1 += (three_nt + 1);
272 dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
273 if(0 == (nt & 7))
274 {
275 for(i = nt; i > 0; i -= 8)
276 {
277 vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
278 pu1_dst_const_three_nt_1 += 8;
279
280 }
281 }
282 else
283 {
284 for(i = nt; i > 0; i -= 4)
285 {
286 vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
287 pu1_dst_const_three_nt_1 += 4;
288 }
289
290 }
291
292 }
293 }
294 if(nt == 16)
295 {
296 WORD32 nbr_flags_temp = 0;
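/* For nt = 16, bottom-left/left/top/top-right each occupy 2 bits of nbr_flags   */
/* (8 pels per bit) and top-left occupies 1 bit; the repack below compresses     */
/* them into consecutive positions: [8] top-left, [7:6] top-right, [5:4] top,    */
/* [3:2] left, [1:0] bottom-left                                                 */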
297 nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
298 + ((nbr_flags & 0x300) >> 4)
299 + ((nbr_flags & 0x3000) >> 6)
300 + ((nbr_flags & 0x10000) >> 8);
301
302 /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (refer to section 8.4.4.2.2) */
303 /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
304 {
305 nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
306
307 if(nbr_id_from_bl == 64)
308 nbr_id_from_bl = 32;
309
310 if(nbr_id_from_bl == 32)
311 {
312 /* for top left : 1 pel per nbr bit */
313 if(!((nbr_flags_temp >> 8) & 0x1))
314 {
315 nbr_id_from_bl++;
316 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
317 }
318 }
319 /* Reverse Substitution Process*/
320 if(nbr_id_from_bl)
321 {
322 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
323 pu1_ref = pu1_dst[nbr_id_from_bl];
324 for(i = (nbr_id_from_bl - 1); i >= 0; i--)
325 {
326 pu1_dst[i] = pu1_ref;
327 }
328 }
329 }
330
331 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
332 while(nbr_id_from_bl < ((T16_4NT) + 1))
333 {
334 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
335 /* Divide by 8 to obtain the original index */
336 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
337
338 /* The Top-left flag is at the last bit location of nbr_flags*/
339 if(nbr_id_from_bl == (T16_4NT / 2))
340 {
341 get_bits = GET_BITS(nbr_flags_temp, 8);
342
343 /* only pel substitution for TL */
344 if(!get_bits)
345 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
346 }
347 else
348 {
349 get_bits = GET_BITS(nbr_flags_temp, frwd_nbr_flag);
350 if(!get_bits)
351 {
352 /* 8 pel substitution (other than TL) */
353 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
354 for(i = 0; i < 8; i++)
355 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
356 }
357
358 }
359 nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
360 }
361 }
362
363 if(nt == 32)
364 {
365 /* Compute trailing zeros based on nbr_flags for the substitution process of below-left (refer to section 8.4.4.2.2) */
366 /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but 1 pel for top-left */
367 {
368 nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
369
370 if(nbr_id_from_bl == 64)
371 {
372 /* for top left : 1 pel per nbr bit */
373 if(!((nbr_flags >> 16) & 0x1))
374 {
375 /* top left not available */
376 nbr_id_from_bl++;
377 /* top and top right; 8 pels per nbr bit */
378 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
379 }
380 }
381 /* Reverse Substitution Process*/
382 if(nbr_id_from_bl)
383 {
384 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
385 pu1_ref = pu1_dst[nbr_id_from_bl];
386 for(i = (nbr_id_from_bl - 1); i >= 0; i--)
387 pu1_dst[i] = pu1_ref;
388 }
389 }
390
391 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
392 while(nbr_id_from_bl < ((T32_4NT)+1))
393 {
394 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
395 /* Divide by 8 to obtain the original index */
396 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
397
398 /* The Top-left flag is at the last bit location of nbr_flags*/
399 if(nbr_id_from_bl == (T32_4NT / 2))
400 {
401 get_bits = GET_BITS(nbr_flags, 16);
402 /* only pel substitution for TL */
403 if(!get_bits)
404 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
405 }
406 else
407 {
408 get_bits = GET_BITS(nbr_flags, frwd_nbr_flag);
409 if(!get_bits)
410 {
411 /* 8 pel substitution (other than TL) */
412 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
413 for(i = 0; i < 8; i++)
414 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
415 }
416
417 }
418 nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
419 }
420 }
421
422 }
423
424 }
425
426 /**
427 *******************************************************************************
428 *
429 * @brief
430 * Intra prediction interpolation filter for ref_filtering
431 *
432 *
433 * @par Description:
434 * Reference DC filtering for neighboring samples dependent on TU size and
435 * mode Refer to section 8.4.4.2.3 in the standard
436 *
437 * @param[in] pu1_src
438 * UWORD8 pointer to the source
439 *
440 * @param[out] pu1_dst
441 * UWORD8 pointer to the destination
442 *
443 * @param[in] nt
444 * integer Transform Block size
445 *
446 * @param[in] mode
447 * integer intraprediction mode
448 *
449 * @returns
450 *
451 * @remarks
452 * None
453 *
454 *******************************************************************************
455 */
456
457
458 void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
459 WORD32 nt,
460 UWORD8 *pu1_dst,
461 WORD32 mode,
462 WORD32 strong_intra_smoothing_enable_flag)
463 {
464 WORD32 filter_flag;
465 WORD32 i = 0;
466 WORD32 four_nt = 4 * nt;
467
468 WORD32 src_4nt;
469
470 /* Naming has been made as per the functionality it has, e.g. pu1_src_tmp_1 denotes pu1_src + 1 */
471 /* src_val_1 to load value from pointer pu1_src_tmp_1, add_res has the result of adding 2 values */
472 UWORD8 *pu1_src_tmp_0 = pu1_src;
473 UWORD8 *pu1_src_tmp_1;
474 UWORD8 *pu1_src_tmp_2;
475 UWORD8 *pu1_dst_tmp_0 = pu1_dst;
476 UWORD8 *pu1_dst_tmp_1;
477
478 uint8x8_t src_val_0, src_val_2;
479 uint8x8_t src_val_1, shift_res;
480 uint8x8_t dup_const_2;
481 uint16x8_t mul_res, add_res;
482 WORD32 bi_linear_int_flag = 0;
483 WORD32 abs_cond_left_flag = 0;
484 WORD32 abs_cond_top_flag = 0;
485 WORD32 dc_val = 1 << (BIT_DEPTH - 5);
486 shift_res = vdup_n_u8(0);
487
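/* gau1_intra_pred_ref_filter[mode] holds one bit per transform size;         */
/* bit (log2(nt) - 2) set means the reference smoothing of section 8.4.4.2.3  */
/* is applied for this mode and size                                          */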
488 filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
489
490 if(0 == filter_flag)
491 {
492 if(pu1_src == pu1_dst)
493 {
494 return;
495 }
496 else
497 {
498 for(i = four_nt; i > 0; i -= 8)
499 {
500 src_val_0 = vld1_u8(pu1_src_tmp_0);
501 pu1_src_tmp_0 += 8;
502 vst1_u8(pu1_dst_tmp_0, src_val_0);
503 pu1_dst_tmp_0 += 8;
504 }
505 pu1_dst[four_nt] = pu1_src[four_nt];
506 }
507 }
508
509 else
510 {
511 /* If strong intra smoothing is enabled and the transform size is 32 */
512 if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
513 {
514 /*Strong Intra Filtering*/
515 abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
516 - (2 * pu1_src[3 * nt]))) < dc_val;
517 abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
518 - (2 * pu1_src[nt]))) < dc_val;
519
520 bi_linear_int_flag = ((1 == abs_cond_left_flag)
521 && (1 == abs_cond_top_flag));
522 }
523
524 src_4nt = pu1_src[4 * nt];
525 /* Strong filtering of reference samples */
526 if(1 == bi_linear_int_flag)
527 {
528 WORD32 two_nt = four_nt >> 1;
529
530 WORD32 pu1_src_0_val = pu1_src[0];
531 WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
532 WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];
533
534 WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
535 uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);
536
537 WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
538 uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);
539
540 const UWORD8 *const_col_i;
541 uint8x8_t const_col_i_val;
542 uint16x8_t prod_val_1;
543 uint16x8_t prod_val_2;
544 uint16x8_t prod_val_3;
545 uint16x8_t prod_val_4;
546 uint8x8_t res_val_1;
547 uint8x8_t res_val_2;
548 uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
549 uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
550 uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
551 pu1_dst_tmp_0 = pu1_dst + 1;
552 pu1_dst_tmp_1 = pu1_dst + two_nt + 1;
553
554 const_col_i = gau1_ihevc_planar_factor + 1;
555
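/* The two vectorised halves below implement the scalar strong filter             */
/* (two_nt = 64 since nt = 32):                                                   */
/*   pu1_dst[i]      = ((64 - i) * pu1_src[0]  + i * pu1_src[64]  + 32) >> 6      */
/*   pu1_dst[64 + i] = ((64 - i) * pu1_src[64] + i * pu1_src[128] + 32) >> 6      */
/* for i = 1 .. 64                                                                */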
556 for(i = two_nt; i > 0; i -= 8)
557 {
558 const_col_i_val = vld1_u8(const_col_i);
559 const_col_i += 8;
560
561 prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
562 prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);
563
564 res_val_1 = vrshrn_n_u16(prod_val_2, 6);
565 prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);
566
567 vst1_u8(pu1_dst_tmp_0, res_val_1);
568 pu1_dst_tmp_0 += 8;
569 prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);
570
571 res_val_2 = vrshrn_n_u16(prod_val_4, 6);
572 vst1_u8(pu1_dst_tmp_1, res_val_2);
573 pu1_dst_tmp_1 += 8;
574 }
575 pu1_dst[2 * nt] = pu1_src[2 * nt];
576 }
577 else
578 {
579 pu1_src_tmp_1 = pu1_src + 1;
580 pu1_src_tmp_2 = pu1_src + 2;
581 pu1_dst_tmp_0 += 1;
582
583 dup_const_2 = vdup_n_u8(2);
584
585 /* Extremities Untouched*/
586 pu1_dst[0] = pu1_src[0];
587
588 /* To avoid problems when the destination and source share the same pointer, this load is done
589 * outside the loop and the 2nd consecutive load is done before the store of the 1st */
590
591 /* Perform bilinear filtering of Reference Samples */
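/* Scalar form of the [1 2 1]/4 smoothing performed below:                            */
/*   pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2,        */
/* for i = 1 .. (4 * nt - 1)                                                          */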
592 for(i = (four_nt - 1); i > 0; i -= 8)
593 {
594 src_val_0 = vld1_u8(pu1_src_tmp_0);
595 pu1_src_tmp_0 += 8;
596
597 src_val_2 = vld1_u8(pu1_src_tmp_2);
598 pu1_src_tmp_2 += 8;
599
600 src_val_1 = vld1_u8(pu1_src_tmp_1);
601 pu1_src_tmp_1 += 8;
602
603 if(i < four_nt - 1)
604 {
605 vst1_u8(pu1_dst_tmp_0, shift_res);
606 pu1_dst_tmp_0 += 8;
607 }
608
609 add_res = vaddl_u8(src_val_0, src_val_2);
610
611 mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
612 shift_res = vrshrn_n_u16(mul_res, 2);
613
614 }
615 vst1_u8(pu1_dst_tmp_0, shift_res);
616 pu1_dst_tmp_0 += 8;
617 }
618 pu1_dst[4 * nt] = src_4nt;
619
620 }
621
622 }
623
624
625
626 /**
627 *******************************************************************************
628 *
629 * @brief
630 * Intra prediction interpolation filter for luma planar
631 *
632 * @par Description:
633 * Planar Intraprediction with reference neighboring samples location
634 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
635 *
636 * @param[in] pu1_src
637 * UWORD8 pointer to the source
638 *
639 * @param[out] pu1_dst
640 * UWORD8 pointer to the destination
641 *
642 * @param[in] src_strd
643 * integer source stride
644 *
645 * @param[in] dst_strd
646 * integer destination stride
647 *
648 * @param[in] nt
649 * integer Transform Block size
650 *
651 * @param[in] wd
652 * integer width of the array
653 *
654 * @returns
655 *
656 * @remarks
657 * None
658 *
659 *******************************************************************************
660 */
661
662 void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
663 WORD32 src_strd,
664 UWORD8 *pu1_dst,
665 WORD32 dst_strd,
666 WORD32 nt,
667 WORD32 mode)
668 {
669 /* named in the way (nt - 1 - col) --> const_nt_1_col (const denotes gau1_ihevc_planar_factor) */
670 /* load const_nt_1_col values into a d register */
671 /* named in the way pu1_ref[nt - 1] --> pu1_ref_nt_1 */
672 /* the value of pu1_ref_nt_1 is duplicated into a d register, hence pu1_ref_nt_1_dup */
673 /* log2nt + 1 is taken care of when the value is assigned */
674 /* in the width-multiple-of-4 case the row loop is also unrolled by 2 and the stores account for this */
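/* Scalar planar relation (section 8.4.4.2.4) implemented by the vector loops below:  */
/*   pu1_dst[row * dst_strd + col] =                                                  */
/*       ( (nt - 1 - col) * pu1_ref[2 * nt - 1 - row]                                 */
/*       + (col + 1)      * pu1_ref[3 * nt + 1]                                       */
/*       + (nt - 1 - row) * pu1_ref[2 * nt + 1 + col]                                 */
/*       + (row + 1)      * pu1_ref[nt - 1]                                           */
/*       + nt ) >> (log2(nt) + 1)                                                     */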
675
676 WORD32 row, col = 0;
677 WORD32 log2nt_plus1 = 6;
678 WORD32 two_nt, three_nt;
679 UWORD8 *pu1_ref_two_nt_1;
680 UWORD8 *pu1_dst_tmp;
681 const UWORD8 *const_nt_1_col;
682 uint8x8_t const_nt_1_col_t;
683 const UWORD8 *const_col_1;
684 uint8x8_t const_col_1_t;
685 uint8_t const_nt_1_row;
686 uint8x8_t const_nt_1_row_dup;
687 uint8_t const_row_1;
688 uint8x8_t const_row_1_dup;
689 uint8_t const_nt = nt;
690 uint16x8_t const_nt_dup;
691 uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
692 uint8x8_t pu1_ref_nt_1_dup;
693 uint8_t pu1_ref_two_nt_1_row;
694 uint8_t pu1_ref_three_nt_1;
695 uint8x8_t pu1_ref_two_nt_1_row_dup;
696 uint8x8_t pu1_ref_two_nt_1_t;
697 uint8x8_t pu1_ref_three_nt_1_dup;
698 uint16x8_t prod_t1;
699 uint16x8_t prod_t2;
700 uint16x8_t sto_res_tmp;
701 uint8x8_t sto_res;
702 int16x8_t log2nt_dup;
703 UNUSED(src_strd);
704 UNUSED(mode);
705 log2nt_plus1 = 32 - CLZ(nt);
706 two_nt = 2 * nt;
707 three_nt = 3 * nt;
708 /* loops have been unrolled, assuming the width is a multiple of 8 */
709 if(0 == (nt & 7))
710 {
711 pu1_dst_tmp = pu1_dst;
712 const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
713
714 const_col_1 = gau1_ihevc_planar_factor + 1;
715 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
716
717 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
718 const_nt_dup = vdupq_n_u16(const_nt);
719
720 log2nt_dup = vdupq_n_s16(log2nt_plus1);
721 log2nt_dup = vnegq_s16(log2nt_dup);
722
723 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
724
725 for(row = 0; row < nt; row++)
726 {
727 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
728 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
729
730 const_nt_1_row = nt - 1 - row;
731 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
732
733 const_row_1 = row + 1;
734 const_row_1_dup = vdup_n_u8(const_row_1);
735
736 const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
737
738 const_col_1 = gau1_ihevc_planar_factor + 1;
739 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
740
741 for(col = nt; col > 0; col -= 8)
742 {
743 const_nt_1_col_t = vld1_u8(const_nt_1_col);
744 const_nt_1_col -= 8;
745 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
746
747 const_col_1_t = vld1_u8(const_col_1);
748 const_col_1 += 8;
749 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
750
751 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
752 pu1_ref_two_nt_1 += 8;
753 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
754
755 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
756 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
757 prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
758 prod_t1 = vaddq_u16(prod_t1, prod_t2);
759
760 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
761 sto_res = vmovn_u16(sto_res_tmp);
762 vst1_u8(pu1_dst_tmp, sto_res);
763 pu1_dst_tmp += 8;
764 }
765 pu1_dst_tmp += dst_strd - nt;
766 }
767 }
768 /* loops have been unrolled, assuming the width is a multiple of 4 */
769 /* If the column count is a multiple of 4, then the height should be a multiple of 2 */
770 else
771 {
772 uint8x8_t const_row_1_dup1;
773 uint8x8_t pu1_ref_two_nt_1_t1;
774 uint8x8_t const_nt_1_col_t1;
775 uint8x8_t const_col_1_t1;
776 uint8x8_t pu1_ref_two_nt_1_row_dup1;
777 uint8x8_t const_nt_1_row_dup1;
778
779 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
780
781 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
782 const_nt_dup = vdupq_n_u16(const_nt);
783
784 log2nt_dup = vdupq_n_s16(log2nt_plus1);
785 log2nt_dup = vnegq_s16(log2nt_dup);
786
787 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
788
789 for(row = 0; row < nt; row += 2)
790 {
791 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
792 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
793 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
794 pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
795 pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);
796
797 const_nt_1_row = nt - 1 - row;
798 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
799 const_nt_1_row = nt - 2 - row;
800 const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
801 const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);
802
803 const_row_1 = row + 1;
804 const_row_1_dup = vdup_n_u8(const_row_1);
805 const_row_1 = row + 2;
806 const_row_1_dup1 = vdup_n_u8(const_row_1);
807 const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);
808
809 const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;
810
811 const_col_1 = gau1_ihevc_planar_factor + 1;
812
813 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
814
815 for(col = nt; col > 0; col -= 4)
816 {
817 const_nt_1_col_t = vld1_u8(const_nt_1_col);
818 const_nt_1_col -= 4;
819 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
820
821 const_col_1_t = vld1_u8(const_col_1);
822 const_col_1 += 4;
823 const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));
824
825 pu1_dst_tmp = pu1_dst;
826 const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);
827
828 const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
829 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
830
831 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
832 pu1_ref_two_nt_1 += 4;
833 const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);
834
835 pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
836 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
837
838 pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
839 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
840
841 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
842 prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
843 prod_t1 = vaddq_u16(prod_t1, prod_t2);
844
845 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
846 sto_res = vmovn_u16(sto_res_tmp);
847
848 vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
849 pu1_dst_tmp += dst_strd;
850
851 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
852 pu1_dst += 4;
853 }
854 pu1_dst += 2 * dst_strd - nt;
855 }
856 }
857
858 }
859 /* INTRA_PRED_LUMA_PLANAR */
860
861 /**
862 *******************************************************************************
863 *
864 * @brief
865 * Intra prediction interpolation filter for luma dc
866 *
867 * @par Description:
868 * Intraprediction for DC mode with reference neighboring samples location
869 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
870 *
871 * @param[in] pu1_src
872 * UWORD8 pointer to the source
873 *
874 * @param[out] pu1_dst
875 * UWORD8 pointer to the destination
876 *
877 * @param[in] src_strd
878 * integer source stride
879 *
880 * @param[in] dst_strd
881 * integer destination stride
882 *
883 * @param[in] nt
884 * integer Transform Block size
885 *
886 * @param[in] wd
887 * integer width of the array
888 *
889 * @returns
890 *
891 * @remarks
892 * None
893 *
894 *******************************************************************************
895 */
896
897 void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
898 WORD32 src_strd,
899 UWORD8 *pu1_dst,
900 WORD32 dst_strd,
901 WORD32 nt,
902 WORD32 mode)
903 {
904 WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
905 WORD32 i = 0;
906 WORD32 row = 0, col = 0, col_count;
907 WORD32 log2nt_plus1 = 6;
908 WORD32 two_nt = 0;
909 uint16x8_t ref_load_q;
910 uint16x8_t three_dc_val_t;
911 uint8x8_t sto_res_tmp;
912 uint8x8_t sto_res_tmp1;
913 uint8x8_t sto_res_tmp2;
914 uint8x8_t sto_res_tmp3;
915 uint8x8_t sto_res_tmp4;
916 uint8x8_t dc_val_t;
917
918 UWORD8 *pu1_ref_tmp;
919 UWORD8 *pu1_ref_tmp1;
920 UWORD8 *pu1_dst_tmp;
921 UWORD8 *pu1_dst_tmp1;
922 UWORD8 *pu1_dst_tmp2;
923 UNUSED(src_strd);
924 UNUSED(mode);
925
926 /* log2nt + 1 is taken care of when the value is assigned. */
927 log2nt_plus1 = 32 - CLZ(nt);
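/* Scalar form of the DC prediction implemented below:                                                   */
/*   dc_val = (sum(pu1_ref[nt .. 2*nt - 1]) + sum(pu1_ref[2*nt + 1 .. 3*nt]) + nt) >> (log2(nt) + 1)     */
/* For nt < 32 the first row and column are additionally filtered:                                       */
/*   pu1_dst[0]              = (pu1_ref[2*nt - 1] + 2 * dc_val + pu1_ref[2*nt + 1] + 2) >> 2             */
/*   pu1_dst[col]            = (pu1_ref[2*nt + 1 + col] + 3 * dc_val + 2) >> 2                           */
/*   pu1_dst[row * dst_strd] = (pu1_ref[2*nt - 1 - row] + 3 * dc_val + 2) >> 2                           */
/* All remaining pixels are set to dc_val                                                                */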
928
929 /* loops have been unrolled, assuming the width is a multiple of 8 */
930 if(0 == (nt & 7))
931 {
932 uint8x8_t ref_load1;
933 uint8x8_t ref_load2;
934 uint16x4_t acc_dc_pair1;
935 uint32x2_t acc_dc_pair2;
936 uint64x1_t acc_dc = vdup_n_u64(col);
937
938 two_nt = 2 * nt;
939 pu1_ref_tmp = pu1_ref + nt;
940 pu1_ref_tmp1 = pu1_ref + two_nt + 1;
941
942 for(i = two_nt; i > nt; i -= 8)
943 {
944 ref_load1 = vld1_u8(pu1_ref_tmp);
945 pu1_ref_tmp += 8;
946 acc_dc_pair1 = vpaddl_u8(ref_load1);
947
948 ref_load2 = vld1_u8(pu1_ref_tmp1);
949 pu1_ref_tmp1 += 8;
950
951 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
952 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
953
954 acc_dc_pair1 = vpaddl_u8(ref_load2);
955 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
956 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
957 }
958
959 dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
960 dc_val_t = vdup_n_u8(dc_val);
961 two_dc_val = 2 * dc_val;
962 three_dc_val = 3 * dc_val;
963 three_dc_val += 2;
964
965 three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
966 pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
967 pu1_dst_tmp = pu1_dst;
968
969
970 if(nt == 32)
971 {
972 for(row = 0; row < nt; row++)
973 {
974 for(col = nt; col > 0; col -= 8)
975 {
976 vst1_u8(pu1_dst_tmp, dc_val_t);
977 pu1_dst_tmp += 8;
978 }
979 pu1_dst_tmp += dst_strd - nt;
980 }
981 }
982 else
983
984 {
985 for(col = nt; col > 0; col -= 8)
986 {
987 ref_load1 = vld1_u8(pu1_ref_tmp);
988 pu1_ref_tmp += 8;
989 ref_load_q = vmovl_u8(ref_load1);
990 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
991 ref_load_q = vshrq_n_u16(ref_load_q, 2);
992 sto_res_tmp = vmovn_u16(ref_load_q);
993 vst1_u8(pu1_dst_tmp, sto_res_tmp);
994 pu1_dst_tmp += 8;
995 }
996
997 pu1_ref_tmp = pu1_ref + two_nt - 9;
998 pu1_dst_tmp = pu1_dst + dst_strd;
999 col_count = nt - 8;
1000
1001 /* Except the first row, the remaining rows are done here */
1002 /* Both column and row have been unrolled by 8 */
1003 /* The stores account for the unrolling */
1004 /* Except the 1st column of the remaining rows (other than the 1st row), the values are */
1005 /* constant, hence they are stored from a duplicated constant value */
1006 /* If the column count is greater than 8, the remaining values are also constant, which is */
1007 /* handled in the inner for loop */
1008
1009 for(row = nt; row > 0; row -= 8)
1010 {
1011 pu1_dst_tmp1 = pu1_dst_tmp + 8;
1012 ref_load1 = vld1_u8(pu1_ref_tmp);
1013 pu1_ref_tmp -= 8;
1014 ref_load_q = vmovl_u8(ref_load1);
1015 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
1016 ref_load_q = vshrq_n_u16(ref_load_q, 2);
1017 sto_res_tmp = vmovn_u16(ref_load_q);
1018
1019 sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);
1020
1021 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
1022 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
1023 vst1_u8(pu1_dst_tmp, sto_res_tmp1);
1024 pu1_dst_tmp += dst_strd;
1025
1026 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
1027 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
1028 vst1_u8(pu1_dst_tmp, sto_res_tmp2);
1029 pu1_dst_tmp += dst_strd;
1030
1031 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
1032 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
1033 vst1_u8(pu1_dst_tmp, sto_res_tmp3);
1034 pu1_dst_tmp += dst_strd;
1035
1036 sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
1037 sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
1038 vst1_u8(pu1_dst_tmp, sto_res_tmp4);
1039 pu1_dst_tmp += dst_strd;
1040
1041 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
1042 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
1043 vst1_u8(pu1_dst_tmp, sto_res_tmp1);
1044 pu1_dst_tmp += dst_strd;
1045
1046 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
1047 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
1048 vst1_u8(pu1_dst_tmp, sto_res_tmp2);
1049 pu1_dst_tmp += dst_strd;
1050
1051 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
1052 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
1053 vst1_u8(pu1_dst_tmp, sto_res_tmp3);
1054 pu1_dst_tmp += dst_strd;
1055 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1056 if(row != 8)
1057 vst1_u8(pu1_dst_tmp, sto_res_tmp4);
1058 pu1_dst_tmp += dst_strd;
1059
1060 for(col = col_count; col > 0; col -= 8)
1061 {
1062 pu1_dst_tmp2 = pu1_dst_tmp1;
1063 vst1_u8(pu1_dst_tmp1, dc_val_t);
1064 pu1_dst_tmp1 += dst_strd;
1065 vst1_u8(pu1_dst_tmp1, dc_val_t);
1066 pu1_dst_tmp1 += dst_strd;
1067 vst1_u8(pu1_dst_tmp1, dc_val_t);
1068 pu1_dst_tmp1 += dst_strd;
1069 vst1_u8(pu1_dst_tmp1, dc_val_t);
1070 pu1_dst_tmp1 += dst_strd;
1071 vst1_u8(pu1_dst_tmp1, dc_val_t);
1072 pu1_dst_tmp1 += dst_strd;
1073 vst1_u8(pu1_dst_tmp1, dc_val_t);
1074 pu1_dst_tmp1 += dst_strd;
1075 vst1_u8(pu1_dst_tmp1, dc_val_t);
1076 pu1_dst_tmp1 += dst_strd;
1077
1078 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1079 if(row != 8)
1080 vst1_u8(pu1_dst_tmp1, dc_val_t);
1081 pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
1082 }
1083 }
1084 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
1085 }
1086 }
1087 /* loops have been unrolled, assuming the width is a multiple of 4 */
1088 else
1089 {
1090 WORD32 acc_dc;
1091 two_nt = 2 * nt;
1092
1093 acc_dc = 0;
1094 pu1_ref_tmp = pu1_ref + nt + 1;
1095 for(i = nt; i < two_nt; i++)
1096 {
1097 acc_dc += pu1_ref[i];
1098 acc_dc += pu1_ref_tmp[i];
1099 }
1100 dc_val = (acc_dc + nt) >> (log2nt_plus1);
1101 two_dc_val = 2 * dc_val;
1102 three_dc_val = 3 * dc_val;
1103 three_dc_val = three_dc_val + 2;
1104 dc_val_t = vdup_n_u8(dc_val);
1105
1106 if(nt == 32)
1107 {
1108 pu1_dst_tmp = pu1_dst;
1109 for(row = 0; row < nt; row++)
1110 {
1111 for(col = nt; col > 0; col -= 4)
1112 {
1113 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
1114 pu1_dst_tmp += 4;
1115 }
1116 pu1_dst_tmp += dst_strd - nt;
1117 }
1118 }
1119 else
1120
1121 {
1122 for(col = 1; col < nt; col++)
1123 {
1124 pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
1125 }
1126
1127 pu1_dst_tmp = pu1_dst + dst_strd + 0;
1128 /* Since first row is already updated before, loop count is nt-1 */
1129 for(row = nt - 1; row > 0; row -= 1)
1130 {
1131 for(col = nt; col > 0; col -= 4)
1132 {
1133 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
1134 pu1_dst_tmp += 4;
1135 }
1136 pu1_dst_tmp += dst_strd - nt;
1137 }
1138
1139 for(row = 1; row < nt; row++)
1140 {
1141 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
1142 }
1143 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
1144 }
1145 }
1146 }
1147 /* INTRA_PRED_LUMA_DC */
1148
1149 /**
1150 *******************************************************************************
1151 *
1152 * @brief
1153 * Intra prediction interpolation filter for horizontal luma variable.
1154 *
1155 * @par Description:
1156 * Horizontal intraprediction with reference neighboring samples location
1157 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
1158 *
1159 * @param[in] pu1_src
1160 * UWORD8 pointer to the source
1161 *
1162 * @param[out] pu1_dst
1163 * UWORD8 pointer to the destination
1164 *
1165 * @param[in] src_strd
1166 * integer source stride
1167 *
1168 * @param[in] dst_strd
1169 * integer destination stride
1170 *
1171 * @param[in] nt
1172 * integer Transform Block size
1173 *
1174 * @param[in] wd
1175 * integer width of the array
1176 *
1177 * @returns
1178 *
1179 * @remarks
1180 * None
1181 *
1182 *******************************************************************************
1183 */
1184
1185 void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
1186 WORD32 src_strd,
1187 UWORD8 *pu1_dst,
1188 WORD32 dst_strd,
1189 WORD32 nt,
1190 WORD32 mode)
1191 {
1192
1193 WORD32 row, col;
1194 WORD32 two_nt;
1195 UNUSED(src_strd);
1196 UNUSED(mode);
1197
1198 two_nt = 2 * nt;
1199
1200
1201 UWORD8 *pu1_dst_tmp = pu1_dst;
1202 UWORD32 pu1_val;
1203 uint8x8_t pu1_val_two_nt_1_row;
1204 if(nt == 32)
1205 {
1206 pu1_dst_tmp = pu1_dst;
1207 for(row = 0; row < nt; row++)
1208 {
1209 pu1_val = pu1_ref[two_nt - 1 - row];
1210 pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
1211 for(col = nt; col > 0; col -= 8)
1212 {
1213 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
1214 pu1_dst_tmp += 8;
1215 }
1216 pu1_dst_tmp += dst_strd - nt;
1217 }
1218 }
1219 else
1220
1221
1222 /* the row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables */
1223 /* naming of variables made according to the operation (instructions) each performs */
1224 /* (e.g. shift_val contains the shifted value, */
1225 /* add_sat holds the add-and-saturate result) */
1226 /* Loops are unrolled by 4 and 8 since the input width is either a multiple of 4 or 8 */
1227 /* rows and columns are unrolled by 4 when the width is a multiple of 4 */
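/* Scalar relations implemented below (horizontal prediction, mode 10):                               */
/*   pu1_dst[row * dst_strd + col] = pu1_ref[2*nt - 1 - row]                                          */
/* and, for nt < 32, the first row is filtered as                                                     */
/*   pu1_dst[col] = CLIP_U8(pu1_ref[2*nt - 1] + ((pu1_ref[2*nt + 1 + col] - pu1_ref[2*nt]) >> 1))     */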
1228 {
1229 if(0 != (nt & 7)) /* cond for multiple of 4 */
1230 {
1231 UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
1232 UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
1233 UWORD8 *pu1_dst_4 = pu1_dst;
1234 UWORD8 *pu1_dst_4_tmp = pu1_dst;
1235
1236 uint32x2_t pu1_ref_val1, pu1_ref_val2;
1237 uint8x8_t dup_sub, round_val, dup_val;
1238 uint16x8_t dup_add, sub_val;
1239 int16x8_t shift_val, add_sat;
1240
1241 pu1_ref_val1 = vdup_n_u32(0);
1242 pu1_ref_val2 = vdup_n_u32(0);
1243
1244 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1245
1246 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
1247
1248 pu1_ref_4_two_nt_plus1 += (two_nt + 1);
1249
1250 pu1_ref_4_two_nt_minus_nt += (two_nt - nt);
1251
1252 for(row = nt; row > 0; row -= 4)
1253 {
1254 for(col = nt; col > 0; col -= 4)
1255 {
1256 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
1257 sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
1258 shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1259
1260 add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
1261 round_val = vqmovun_s16(add_sat);
1262 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
1263 pu1_dst_4 += dst_strd;
1264
1265 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
1266 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
1267 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1268 pu1_dst_4 += dst_strd;
1269
1270 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
1271 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1272 pu1_dst_4 += dst_strd;
1273
1274 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
1275 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1276 pu1_dst_4 += dst_strd;
1277
1278
1279 }
1280 /* worst cases */
1281 pu1_ref_4_two_nt_minus_nt += 3;
1282 pu1_ref_4_two_nt_plus1 += 4;
1283 pu1_dst_4 = (pu1_dst_4_tmp + 4);
1284 }
1285
1286 }
1287
1288 /* dup_1 - dup_8 hold the duplicated values taken from the loaded source */
1289 /* naming of variables made according to the operation (instructions) each performs */
1290 /* Loops are unrolled by 4 and 8 since the input width is either a multiple of 4 or 8 */
1291 /* rows and columns are unrolled by 8 when the width is a multiple of 8 */
1292
1293 else
1294 {
1295 UWORD8 *pu1_ref_tmp_1 = pu1_ref;
1296 UWORD8 *pu1_ref_tmp_2 = pu1_ref;
1297
1298 UWORD8 *pu1_dst_tmp_1 = pu1_dst;
1299 UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
1300 UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;
1301
1302 uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
1303 uint16x8_t sub_res, dup_add;
1304 int16x8_t shift_res, add_res;
1305
1306 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1307 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
1308
1309 pu1_ref_tmp_1 += (two_nt + 1);
1310 pu1_ref_tmp_2 += (two_nt - 1);
1311
1312 for(col = nt; col > 0; col -= 8)
1313 {
1314 src_tmp = vld1_u8(pu1_ref_tmp_1);
1315 pu1_ref_tmp_1 += 8;
1316
1317 sub_res = vsubl_u8(src_tmp, dup_sub);
1318 shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
1319 add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
1320 round_val = vqmovun_s16(add_res);
1321 vst1_u8(pu1_dst_tmp_1, round_val);
1322 pu1_dst_tmp_1 += 8;
1323 }
1324
1325 for(row = nt; row > 0; row -= 8)
1326 {
1327 pu1_ref_tmp_2 -= 8;
1328
1329 src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
1330 rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
1331
1332 dup_1 = vdup_lane_u8(rev_res, 0);
1333 dup_2 = vdup_lane_u8(rev_res, 1);
1334 dup_3 = vdup_lane_u8(rev_res, 2);
1335 dup_4 = vdup_lane_u8(rev_res, 3);
1336 dup_5 = vdup_lane_u8(rev_res, 4);
1337 dup_6 = vdup_lane_u8(rev_res, 5);
1338 dup_7 = vdup_lane_u8(rev_res, 6);
1339 dup_8 = vdup_lane_u8(rev_res, 7);
1340
1341 for(col = nt; col > 0; col -= 8)
1342 {
1343 pu1_dst_tmp_2 = pu1_dst_tmp_3;
1344
1345 vst1_u8(pu1_dst_tmp_2, dup_1);
1346 pu1_dst_tmp_2 += dst_strd;
1347
1348 vst1_u8(pu1_dst_tmp_2, dup_2);
1349 pu1_dst_tmp_2 += dst_strd;
1350
1351 vst1_u8(pu1_dst_tmp_2, dup_3);
1352 pu1_dst_tmp_2 += dst_strd;
1353
1354 vst1_u8(pu1_dst_tmp_2, dup_4);
1355 pu1_dst_tmp_2 += dst_strd;
1356
1357 vst1_u8(pu1_dst_tmp_2, dup_5);
1358 pu1_dst_tmp_2 += dst_strd;
1359
1360 vst1_u8(pu1_dst_tmp_2, dup_6);
1361 pu1_dst_tmp_2 += dst_strd;
1362
1363 vst1_u8(pu1_dst_tmp_2, dup_7);
1364 pu1_dst_tmp_2 += dst_strd;
1365
1366 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1367 if(row != 8)
1368 vst1_u8(pu1_dst_tmp_2, dup_8);
1369 pu1_dst_tmp_2 += dst_strd;
1370
1371 pu1_dst_tmp_3 += 8;
1372 }
1373 pu1_dst_tmp_2 -= (nt - 8);
1374 pu1_dst_tmp_3 = pu1_dst_tmp_2;
1375 }
1376 }
1377 }
1378 }
1379 /* INTRA_PRED_LUMA_HORZ */
1380
1381 /**
1382 *******************************************************************************
1383 *
1384 * @brief
1385 * Intra prediction interpolation filter for vertical luma variable.
1386 *
1387 * @par Description:
1388 * Vertical intraprediction with reference neighboring samples location
1389 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
1390 *
1391 * @param[in] pu1_src
1392 * UWORD8 pointer to the source
1393 *
1394 * @param[out] pu1_dst
1395 * UWORD8 pointer to the destination
1396 *
1397 * @param[in] src_strd
1398 * integer source stride
1399 *
1400 * @param[in] dst_strd
1401 * integer destination stride
1402 *
1403 * @param[in] nt
1404 * integer Transform Block size
1405 *
1406 * @param[in] wd
1407 * integer width of the array
1408 *
1409 * @returns
1410 *
1411 * @remarks
1412 * None
1413 *
1414 *******************************************************************************
1415 */
1416
1417 void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
1418 WORD32 src_strd,
1419 UWORD8 *pu1_dst,
1420 WORD32 dst_strd,
1421 WORD32 nt,
1422 WORD32 mode)
1423 {
1424 WORD32 row, col;
1425 WORD32 two_nt;
1426 UNUSED(src_strd);
1427 UNUSED(mode);
1428
1429 two_nt = 2 * nt;
1430
1431 UWORD8 *pu1_dst_tmp = pu1_dst;
1432 UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
1433 uint8x8_t pu1_val_two_nt_1_col;
1434 if(nt == 32)
1435 {
1436 pu1_dst_tmp = pu1_dst;
1437 for(row = 0; row < nt; row++)
1438 {
1439 for(col = nt; col > 0; col -= 8)
1440 {
1441 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
1442 pu1_ref_tmp_1 += 8;
1443 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
1444 pu1_dst_tmp += 8;
1445 }
1446 pu1_ref_tmp_1 -= nt;
1447 pu1_dst_tmp += dst_strd - nt;
1448 }
1449 }
1450 else
1451
1452 {
1453 /* naming of variables made according to the operation (instructions) each performs */
1454 /* (e.g. shift_val contains the shifted value, */
1455 /* add_sat holds the add-and-saturate result) */
1456 /* Loops are unrolled by 4 and 8 since the input width is either a multiple of 4 or 8 */
1457 /* rows and columns are unrolled by 4 when the width is a multiple of 4 */
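/* Scalar relations implemented below (vertical prediction, mode 26):                                     */
/*   pu1_dst[row * dst_strd + col] = pu1_ref[2*nt + 1 + col]                                              */
/* and, for nt < 32, the first column is filtered as                                                      */
/*   pu1_dst[row * dst_strd] = CLIP_U8(pu1_ref[2*nt + 1] + ((pu1_ref[2*nt - 1 - row] - pu1_ref[2*nt]) >> 1)) */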
1458
1459 if(0 != (nt & 7))
1460 {
1461 WORD32 cond_4 = 0;
1462 UWORD8 *pu1_ref_val1 = pu1_ref;
1463 UWORD8 *pu1_ref_val2 = pu1_ref;
1464 UWORD8 *pu1_ref_val3 = pu1_ref;
1465
1466 UWORD8 *pu1_dst_val1 = pu1_dst;
1467 UWORD8 *pu1_dst_val2 = pu1_dst;
1468 UWORD8 *pu1_dst_val3 = pu1_dst;
1469
1470 uint8x8_t dup_2_sub, round_val, vext_val;
1471 uint16x8_t dup_2_add;
1472 uint32x2_t src_val1, src_val2, src_val3;
1473 uint16x8_t sub_val;
1474 int16x8_t shift_val1, add_sat;
1475 uint64x1_t shift_val2;
1476
1477 src_val1 = vdup_n_u32(0);
1478 src_val2 = vdup_n_u32(0);
1479 src_val3 = vdup_n_u32(0);
1480 pu1_ref_val1 += (two_nt - nt);
1481 pu1_ref_val3 += (two_nt + 2);
1482 pu1_ref_val2 += (two_nt + 1);
1483
1484 dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
1485 dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1486
1487 /* loops to store the first nt sets of values in the destination */
1488
1489 for(row = nt; row > 0; row -= 4)
1490 {
1491 for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
1492 {
1493 /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/
1494 src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
1495 sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
1496 shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1497 add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
1498 round_val = vqmovun_s16(add_sat);
1499
1500 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1501 src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
1502 vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
1503 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1504 pu1_dst_val1 += dst_strd;
1505
1506 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1507
1508 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1509 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1510 pu1_dst_val1 += dst_strd;
1511
1512 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1513
1514 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1515 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1516 pu1_dst_val1 += dst_strd;
1517
1518 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1519
1520 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1521 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1522 pu1_dst_val1 += dst_strd;
1523
1524 pu1_ref_val1 -= 4;
1525 }
1526
1527 /* loop to store next sets of eight values in the destination */
1528
1529 for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
1530 {
1531 src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
1532
1533 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1534 pu1_dst_val2 += dst_strd;
1535
1536 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1537 pu1_dst_val2 += dst_strd;
1538
1539 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1540 pu1_dst_val2 += dst_strd;
1541
1542 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1543 pu1_dst_val2 += dst_strd;
1544 }
1545 pu1_ref_val2 += 4;
1546 pu1_dst_val3 += 4;
1547 pu1_dst_val2 = pu1_dst_val3;
1548 cond_4 = 1;
1549 }
1550 }
1551
1552 /* rows and columns are unrolled by 8, when the width is multiple of 8 */
1553 else
1554 {
1555 WORD32 cond = 0, col_1;
1556 UWORD8 *pu1_dst_tmp_1 = pu1_dst;
1557 UWORD8 *pu1_dst_tmp_2 = pu1_dst;
1558 UWORD8 *pu1_dst_tmp_3 = pu1_dst;
1559
1560 UWORD8 *pu1_ref_tmp_1 = pu1_ref;
1561 UWORD8 *pu1_ref_tmp_2 = pu1_ref;
1562 UWORD8 *pu1_ref_tmp_3 = pu1_ref;
1563
1564 uint8x8_t pu1_src_tmp1;
1565 uint8x8_t pu1_src_tmp2;
1566
1567 uint8x8_t dup_sub;
1568 uint16x8_t dup_add;
1569 int16x8_t subsh_val;
1570 int16x8_t addsat_val;
1571 uint16x8_t sub_val;
1572 uint8x8_t round_val;
1573 uint8x8_t vext_t;
1574 uint64x1_t shift_64;
1575
1576 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1577 dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1578
1579 pu1_ref_tmp_1 += (two_nt);
1580 pu1_ref_tmp_1 -= 8;
1581 pu1_ref_tmp_2 += (two_nt + 2);
1582 pu1_ref_tmp_3 += (two_nt + 1);
1583
1584 /* loops to store the first nt sets of values in the destination */
1585
1586 for(row = nt; row > 0; row -= 8)
1587 {
1588 for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
1589 {
1590 pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
1591
1592 sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
1593 subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1594 addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
1595 round_val = vqmovun_s16(addsat_val);
1596
1597 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1598
1599 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
1600 vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
1601 vst1_u8(pu1_dst_tmp_1, vext_t);
1602 pu1_dst_tmp_1 += dst_strd;
1603
1604 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1605
1606 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1607 vst1_u8(pu1_dst_tmp_1, vext_t);
1608 pu1_dst_tmp_1 += dst_strd;
1609
1610 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1611 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1612 vst1_u8(pu1_dst_tmp_1, vext_t);
1613 pu1_dst_tmp_1 += dst_strd;
1614
1615 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1616 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1617 vst1_u8(pu1_dst_tmp_1, vext_t);
1618 pu1_dst_tmp_1 += dst_strd;
1619
1620 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
1621 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1622 vst1_u8(pu1_dst_tmp_1, vext_t);
1623 pu1_dst_tmp_1 += dst_strd;
1624
1625 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
1626 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1627 vst1_u8(pu1_dst_tmp_1, vext_t);
1628 pu1_dst_tmp_1 += dst_strd;
1629
1630 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
1631 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1632 vst1_u8(pu1_dst_tmp_1, vext_t);
1633 pu1_dst_tmp_1 += dst_strd;
1634
1635 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
1636 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1637 vst1_u8(pu1_dst_tmp_1, vext_t);
1638 pu1_dst_tmp_1 += dst_strd;
1639
1640 pu1_ref_tmp_1 -= 8;
1641 }
1642
1643 /* loop to store next sets of eight values in the destination */
1644
1645 for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
1646 {
1647 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
1648
1649 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1650 pu1_dst_tmp_2 += dst_strd;
1651
1652 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1653 pu1_dst_tmp_2 += dst_strd;
1654
1655 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1656 pu1_dst_tmp_2 += dst_strd;
1657
1658 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1659 pu1_dst_tmp_2 += dst_strd;
1660
1661 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1662 pu1_dst_tmp_2 += dst_strd;
1663
1664 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1665 pu1_dst_tmp_2 += dst_strd;
1666
1667 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1668 pu1_dst_tmp_2 += dst_strd;
1669
1670 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1671 pu1_dst_tmp_2 += dst_strd;
1672 }
1673 pu1_ref_tmp_3 += 8;
1674 pu1_dst_tmp_3 += 8;
1675 pu1_dst_tmp_2 = pu1_dst_tmp_3;
1676 cond = 1;
1677 }
1678 }
1679 }
1680 }
1681 /* INTRA_PRED_LUMA_VER */
1682
1683 /**
1684 *******************************************************************************
1685 *
1686 * @brief
1687 * Intra prediction interpolation filter for luma mode2.
1688 *
1689 * @par Description:
1690 * Intraprediction for mode 2 (sw angle) with reference neighboring samples
1691 * location pointed by 'pu1_ref' to the TU block location pointed by
1692 * 'pu1_dst'
1693 *
1694 * @param[in] pu1_src
1695 * UWORD8 pointer to the source
1696 *
1697 * @param[out] pu1_dst
1698 * UWORD8 pointer to the destination
1699 *
1700 * @param[in] src_strd
1701 * integer source stride
1702 *
1703 * @param[in] dst_strd
1704 * integer destination stride
1705 *
1706 * @param[in] nt
1707 * integer Transform Block size
1708 *
1709 * @param[in] wd
1710 * integer width of the array
1711 *
1712 * @returns
1713 *
1714 * @remarks
1715 * None
1716 *
1717 *******************************************************************************
1718 */
1719
1720 void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
1721 WORD32 src_strd,
1722 UWORD8 *pu1_dst,
1723 WORD32 dst_strd,
1724 WORD32 nt,
1725 WORD32 mode)
1726 {
1727
1728 WORD32 row, col;
1729 WORD32 two_nt;
1730 UNUSED(src_strd);
1731 UNUSED(mode);
1732
1733 /* rev_res is named so because it holds the reversed result value */
1734 /* Loops are unrolled by 4 and 8 since the input width is either a multiple of 4 or 8 */
1735 /* rows and columns are unrolled by 4 when the width is a multiple of 4 */
1736
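/* Mode 2 is the pure 45-degree bottom-left diagonal; both unrolled paths below   */
/* realise the scalar relation                                                    */
/*   pu1_dst[row * dst_strd + col] = pu1_ref[2*nt - 2 - row - col]                */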
1737 if(0 != (nt & 7))
1738 {
1739 UWORD8 *pu1_ref_tmp = pu1_ref;
1740 UWORD8 *pu1_dst_tmp = pu1_dst;
1741 uint8x8_t pu1_src_val, rev_res;
1742 uint64x1_t shift_res;
1743
1744 for(col = nt; col > 0; col -= 4)
1745 {
1746 for(row = nt; row > 0; row -= 4)
1747 {
1748 /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */
1749
1750 pu1_src_val = vld1_u8(pu1_ref_tmp);
1751 shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
1752 rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
1753
1754 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
1755 pu1_dst_tmp += dst_strd;
1756
1757 shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
1758 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1759 pu1_dst_tmp += dst_strd;
1760
1761 shift_res = vshr_n_u64(shift_res, 8);
1762 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1763 pu1_dst_tmp += dst_strd;
1764
1765 shift_res = vshr_n_u64(shift_res, 8);
1766 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1767 pu1_dst_tmp += dst_strd;
1768 }
1769 }
1770 }
1771
1772 /* rev_val_second, rev_val_first reverse the loaded values so as to get them in the right order */
1773 /* shift_64 shifts the reversed 2nd values to get the value we need */
1774 /* rows and columns are unrolled by 8 when the width is a multiple of 8 */
1775
1776 else
1777 {
1778 UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
1779 UWORD8 *pu1_dst_tmp = pu1_dst;
1780 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1781
1782 uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
1783 uint64x1_t shift_val;
1784
1785 two_nt = 2 * nt;
1786 pu1_ref_two_nt_minus2 += (two_nt);
1787 pu1_ref_two_nt_minus2 -= 8;
1788
1789 for(col = nt; col > 0; col -= 8)
1790 {
1791 for(row = nt; row > 0; row -= 8)
1792 {
1793 pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
1794 rev_val_first = vrev64_u8(pu1_src_val2);
1795
1796 pu1_ref_two_nt_minus2 -= 8;
1797 pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
1798 rev_val_second = vrev64_u8(pu1_src_val1);
1799
1800 vext_t = vext_u8(rev_val_first, rev_val_second, 1);
1801 vst1_u8(pu1_dst_tmp, vext_t);
1802 pu1_dst_tmp += dst_strd;
1803
1804 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
1805 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1806 vst1_u8(pu1_dst_tmp, vext_t);
1807 pu1_dst_tmp += dst_strd;
1808
1809 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
1810 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1811 vst1_u8(pu1_dst_tmp, vext_t);
1812 pu1_dst_tmp += dst_strd;
1813
1814 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
1815 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1816 vst1_u8(pu1_dst_tmp, vext_t);
1817 pu1_dst_tmp += dst_strd;
1818
1819 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
1820 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1821 vst1_u8(pu1_dst_tmp, vext_t);
1822 pu1_dst_tmp += dst_strd;
1823
1824 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
1825 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1826 vst1_u8(pu1_dst_tmp, vext_t);
1827 pu1_dst_tmp += dst_strd;
1828
1829 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
1830 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1831 vst1_u8(pu1_dst_tmp, vext_t);
1832 pu1_dst_tmp += dst_strd;
1833
1834 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
1835 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1836 vst1_u8(pu1_dst_tmp, vext_t);
1837 pu1_dst_tmp += dst_strd;
1838 }
1839 pu1_dst_tmp_plus8 += 8;
1840 pu1_dst_tmp = pu1_dst_tmp_plus8;
1841 pu1_ref_two_nt_minus2 += (nt - 8);
1842 }
1843 }
1844 }
1845 /* INTRA_PRED_LUMA_MODE2 */
1846
1847 /**
1848 *******************************************************************************
1849 *
1850 * @brief
1851 * Intra prediction interpolation filter for luma mode 18 and mode 34.
1852 *
1853 * @par Description:
1854 * Intraprediction for mode 18 and mode 34 (the pure diagonal angles) with
1855 * reference neighboring samples location pointed by 'pu1_ref' to the TU
1856 * block location pointed by 'pu1_dst'
1857 *
1858 * @param[in] pu1_src
1859 * UWORD8 pointer to the source
1860 *
1861 * @param[out] pu1_dst
1862 * UWORD8 pointer to the destination
1863 *
1864 * @param[in] src_strd
1865 * integer source stride
1866 *
1867 * @param[in] dst_strd
1868 * integer destination stride
1869 *
1870 * @param[in] nt
1871 * integer Transform Block size
1872 *
1873 * @param[in] mode
1874 * integer intraprediction mode
1875 *
1876 * @returns
1877 *
1878 * @remarks
1879 * None
1880 *
1881 *******************************************************************************
1882 */
1883
1884 void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
1885 WORD32 src_strd,
1886 UWORD8 *pu1_dst,
1887 WORD32 dst_strd,
1888 WORD32 nt,
1889 WORD32 mode)
1890 {
1891
1892 WORD32 row, col, idx;
1893 WORD32 intraPredAngle = 32;
1894 WORD32 two_nt;
1895 UNUSED(src_strd);
1896 two_nt = 2 * nt;
1897
1898 UWORD8 *pu1_ref_tmp = pu1_ref;
1899 UWORD8 *pu1_ref_tmp1 = pu1_ref;
1900 UWORD8 *pu1_dst_tmp = pu1_dst;
1901 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1902
1903 uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
1904
1905 /* src_tmp_1st and src_tmp_2nd hold the first eight and the next eight values loaded from the source (pu1_ref) */
1906 /* vext1 - vext7 hold the results of vext operations between the two loaded vectors; separate registers help dual issue */
1907 /* Loops are unrolled by 4 and by 8, since the input width is a multiple of either 4 or 8 */
1908 /* rows and columns are unrolled by 8 when the width is a multiple of 8 */
1909 /* separate loops are maintained for mode 18 and mode 34 */
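/* A scalar sketch of what the unrolled copies compute (derived from the */
/* UNROLLED comments below; illustration only, not part of the original code): */
/*     for(row = 0; row < nt; row++)                                           */
/*     {                                                                       */
/*         WORD32 idx = (mode == 34) ? (row + 1) : -(row + 1);                 */
/*         for(col = 0; col < nt; col++)                                       */
/*             pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1]; */
/*     }                                                                       */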
1910
1911 /* Condition to allow only widths that are multiples of 8 */
1912 if(0 == (nt & 7))
1913 {
1914 if(mode == 34)
1915 {
1916 pu1_ref_tmp += (two_nt + 2);
1917
1918 for(row = nt; row > 0; row -= 8)
1919 {
1920 for(col = nt; col > 0; col -= 8)
1921 {
1922 /* Loading 1st eight values */
1923 src_tmp_1st = vld1_u8(pu1_ref_tmp);
1924 pu1_ref_tmp += 8;
1925
1926 /* Loading next eight values */
1927 src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1928
1929 /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1930 vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
1931 vst1_u8(pu1_dst_tmp, src_tmp_1st);
1932 pu1_dst_tmp += dst_strd;
1933
1934 vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
1935 vst1_u8(pu1_dst_tmp, vext1);
1936 pu1_dst_tmp += dst_strd;
1937
1938 vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
1939 vst1_u8(pu1_dst_tmp, vext2);
1940 pu1_dst_tmp += dst_strd;
1941
1942 vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
1943 vst1_u8(pu1_dst_tmp, vext3);
1944 pu1_dst_tmp += dst_strd;
1945
1946 vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
1947 vst1_u8(pu1_dst_tmp, vext4);
1948 pu1_dst_tmp += dst_strd;
1949
1950 vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
1951 vst1_u8(pu1_dst_tmp, vext5);
1952 pu1_dst_tmp += dst_strd;
1953
1954 vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
1955 vst1_u8(pu1_dst_tmp, vext6);
1956 pu1_dst_tmp += dst_strd;
1957
1958 vst1_u8(pu1_dst_tmp, vext7);
1959 pu1_dst_tmp += dst_strd;
1960 }
1961
1962 pu1_dst_tmp_plus8 += 8;
1963 pu1_dst_tmp = pu1_dst_tmp_plus8;
1964 pu1_ref_tmp -= (nt - 8);
1965 }
1966 }
1967 else /* Loop for mode 18 */
1968 {
1969 pu1_ref_tmp += (two_nt);
1970
1971 for(row = nt; row > 0; row -= 8)
1972 {
1973 for(col = nt; col > 0; col -= 8)
1974 {
1975 /* Loading 1st eight values */
1976 src_tmp_1st = vld1_u8(pu1_ref_tmp);
1977 pu1_ref_tmp -= 8;
1978
1979 /* Loading next eight values */
1980 src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1981
1982 /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1983 vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
1984 vst1_u8(pu1_dst_tmp, src_tmp_1st);
1985 pu1_dst_tmp += dst_strd;
1986
1987 vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
1988 vst1_u8(pu1_dst_tmp, vext1);
1989 pu1_dst_tmp += dst_strd;
1990
1991 vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
1992 vst1_u8(pu1_dst_tmp, vext2);
1993 pu1_dst_tmp += dst_strd;
1994
1995 vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
1996 vst1_u8(pu1_dst_tmp, vext3);
1997 pu1_dst_tmp += dst_strd;
1998
1999 vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
2000 vst1_u8(pu1_dst_tmp, vext4);
2001 pu1_dst_tmp += dst_strd;
2002
2003 vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
2004 vst1_u8(pu1_dst_tmp, vext5);
2005 pu1_dst_tmp += dst_strd;
2006
2007 vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
2008 vst1_u8(pu1_dst_tmp, vext6);
2009 pu1_dst_tmp += dst_strd;
2010
2011 vst1_u8(pu1_dst_tmp, vext7);
2012 pu1_dst_tmp += dst_strd;
2013 }
2014 pu1_dst_tmp_plus8 += 8;
2015 pu1_dst_tmp = pu1_dst_tmp_plus8;
2016 pu1_ref_tmp += (nt + 8);
2017 }
2018 }
2019 }
2020
2021 /* rows and columns are unrolled by 4 when the width is a multiple of 4 */
2022
2023 else /* loop for multiples of 4 */
2024 {
2025 uint8x8_t src_val1;
2026 uint8x8_t src_val2;
2027
2028 if(mode == 18)
2029 intraPredAngle = -32;
2030 else if(mode == 34)
2031 intraPredAngle = 32;
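/* The integer offset idx selects where each output row starts in the reference: */
/* with a +/-32 angle, idx = ((row + 1) * intraPredAngle) >> 5 is just +/-(row + 1). */
/* For example, at row = 0, mode 34 reads from pu1_ref[two_nt + 2] and */
/* mode 18 reads from pu1_ref[two_nt]. */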
2032
2033 for(row = 0; row < nt; row += 2)
2034 {
2035 /* unrolling 2 rows */
2036 idx = ((row + 1) * intraPredAngle) >> 5;
2037 pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
2038 src_val1 = vld1_u8(pu1_ref_tmp);
2039
2040 idx = ((row + 2) * intraPredAngle) >> 5;
2041 pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
2042 src_val2 = vld1_u8(pu1_ref_tmp1);
2043
2044 /* unrolling 4 col */
2045 for(col = nt; col > 0; col -= 4)
2046 {
2047 pu1_dst_tmp = pu1_dst;
2048 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
2049 pu1_dst_tmp += dst_strd;
2050 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
2051 pu1_dst += 4;
2052 }
2053 pu1_dst += 2 * dst_strd - nt;
2054 }
2055 }
2056 }
2057 /* INTRA_PRED_LUMA_MODE_18_34 */
2058
2059 /**
2060 *******************************************************************************
2061 *
2062 * @brief
2063 * Intra prediction interpolation filter for luma mode 3 to mode 9
2064 *
2065 * @par Description:
2066 * Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
2067 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2068 * block location pointed by 'pu1_dst'
2069 *
2070 * @param[in] pu1_src
2071 * UWORD8 pointer to the source
2072 *
2073 * @param[out] pu1_dst
2074 * UWORD8 pointer to the destination
2075 *
2076 * @param[in] src_strd
2077 * integer source stride
2078 *
2079 * @param[in] dst_strd
2080 * integer destination stride
2081 *
2082 * @param[in] nt
2083 * integer Transform Block size
2084 *
2085 * @param[in] mode
2086 * integer intraprediction mode
2087 *
2088 * @returns
2089 *
2090 * @remarks
2091 * None
2092 *
2093 *******************************************************************************
2094 */
2095
2096
2097 void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
2098 WORD32 src_strd,
2099 UWORD8 *pu1_dst,
2100 WORD32 dst_strd,
2101 WORD32 nt,
2102 WORD32 mode)
2103 {
2104
2105 WORD32 row, col;
2106 WORD32 intra_pred_ang;
2107 WORD32 pos, fract = 100, fract_prev;
2108 UNUSED(src_strd);
2109 if(0 == (nt & 7))
2110 {
2111
2112 UWORD8 *pu1_ref_main_idx = pu1_ref;
2113 UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2114
2115 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2116 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2117
2118 WORD32 two_nt = 2 * nt;
2119
2120 pu1_ref_main_idx += two_nt;
2121 pu1_ref_main_idx_1 += two_nt - 1;
2122
2123 uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2124 uint8x8_t shift_res;
2125 uint16x8_t mul_res1, mul_res2, add_res;
2126
2127 /* Intra Pred Angle according to the mode */
2128 intra_pred_ang = gai4_ihevc_ang_table[mode];
2129
2130 pu1_ref_main_idx -= 8;
2131 pu1_ref_main_idx_1 -= 8;
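/* Note (illustration only): for the first column the 8-byte load below ends at */
/* pu1_ref[two_nt - 1], the left neighbour nearest the top-left sample, and the */
/* interpolated lanes are stored starting from lane 7, so the samples nearest */
/* the top-left land in the top destination rows. */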
2132
2133 for(col = 0; col < nt; col++)
2134 {
2135 fract_prev = fract;
2136
2137 pos = ((col + 1) * intra_pred_ang);
2138 fract = pos & (31);
2139
2140 if(fract_prev < fract)
2141 {
2142 pu1_ref_main_idx += 1;
2143 pu1_ref_main_idx_1 += 1;
2144 }
2145
2146 dup_const_fract = vdup_n_u8((uint8_t)fract);
2147 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
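/* The vmull/vaddq/vrshrn sequence below is the usual two-tap angular */
/* interpolation; as a scalar sketch (illustration only): */
/*     dst = ((32 - fract) * ref[i] + fract * ref[i + 1] + 16) >> 5 */
/* where ref[i] and ref[i + 1] are the two neighbouring main reference samples; */
/* e.g. fract = 12 gives (20 * ref[i] + 12 * ref[i + 1] + 16) >> 5. */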
2148
2149 for(row = nt; row > 0; row -= 8)
2150 {
2151 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2152 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2153
2154 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2155 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2156
2157 add_res = vaddq_u16(mul_res1, mul_res2);
2158
2159 shift_res = vrshrn_n_u16(add_res, 5);
2160
2161 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2162 pu1_dst_tmp1 += dst_strd;
2163
2164 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2165 pu1_dst_tmp1 += dst_strd;
2166
2167 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2168 pu1_dst_tmp1 += dst_strd;
2169
2170 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2171 pu1_dst_tmp1 += dst_strd;
2172
2173 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2174 pu1_dst_tmp1 += dst_strd;
2175
2176 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2177 pu1_dst_tmp1 += dst_strd;
2178
2179 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2180 pu1_dst_tmp1 += dst_strd;
2181
2182 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2183 pu1_dst_tmp1 += dst_strd;
2184
2185 pu1_ref_main_idx -= 8;
2186 pu1_ref_main_idx_1 -= 8;
2187
2188 }
2189 pu1_dst_tmp2 += 1;
2190 pu1_dst_tmp1 = pu1_dst_tmp2;
2191
2192 pu1_ref_main_idx += nt;
2193 pu1_ref_main_idx_1 += nt;
2194
2195 pu1_ref_main_idx -= 1;
2196 pu1_ref_main_idx_1 -= 1;
2197
2198 }
2199 }
2200 else
2201 {
2202 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2203 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2204 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2205 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2206
2207 pu1_ref_tmp1 += nt;
2208 pu1_ref_tmp2 += (nt - 1);
2209
2210 uint8x8_t dup_fract, dup_32_fract, shift_res;
2211 uint16x8_t mul_res1, mul_res2, add_res;
2212 uint32x2_t pu1_ref_val1, pu1_ref_val2;
2213
2214 pu1_ref_val1 = vdup_n_u32(0);
2215 pu1_ref_val2 = vdup_n_u32(0);
2216
2217 /* Intra Pred Angle according to the mode */
2218 intra_pred_ang = gai4_ihevc_ang_table[mode];
2219
2220
2221 for(col = 0; col < nt; col++)
2222 {
2223 fract_prev = fract;
2224 pos = ((col + 1) * intra_pred_ang);
2225 fract = pos & (31);
2226 if(fract_prev < fract)
2227 {
2228 pu1_ref_tmp1 += 1;
2229 pu1_ref_tmp2 += 1;
2230 }
2231 dup_fract = vdup_n_u8((uint8_t)fract);
2232 dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2233
2234 for(row = nt; row > 0; row -= 4)
2235 {
2236 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2237 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2238
2239 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2240 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2241
2242 add_res = vaddq_u16(mul_res1, mul_res2);
2243
2244 shift_res = vrshrn_n_u16(add_res, 5);
2245
2246 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2247 pu1_dst_tmp1 += dst_strd;
2248
2249 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2250 pu1_dst_tmp1 += dst_strd;
2251
2252 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2253 pu1_dst_tmp1 += dst_strd;
2254
2255 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2256
2257 }
2258 pu1_ref_tmp1 -= 1;
2259 pu1_ref_tmp2 -= 1;
2260
2261 pu1_dst_tmp2 += 1;
2262 pu1_dst_tmp1 = pu1_dst_tmp2;
2263
2264 }
2265
2266
2267 }
2268
2269 }
2270
2271 /**
2272 *******************************************************************************
2273 *
2274 * @brief
2275 * Intra prediction interpolation filter for luma mode 11 to mode 17
2276 *
2277 * @par Description:
2278 * Intraprediction for mode 11 to 17 (negative angle, horizontal mode )
2279 * with reference neighboring samples location pointed by 'pu1_ref' to the
2280 * TU block location pointed by 'pu1_dst'
2281 *
2282 * @param[in] pu1_src
2283 * UWORD8 pointer to the source
2284 *
2285 * @param[out] pu1_dst
2286 * UWORD8 pointer to the destination
2287 *
2288 * @param[in] src_strd
2289 * integer source stride
2290 *
2291 * @param[in] dst_strd
2292 * integer destination stride
2293 *
2294 * @param[in] nt
2295 * integer Transform Block size
2296 *
2297 * @param[in] mode
2298 * integer intraprediction mode
2299 *
2300 * @returns
2301 *
2302 * @remarks
2303 * None
2304 *
2305 *******************************************************************************
2306 */
2307
2308
2309 void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
2310 WORD32 src_strd,
2311 UWORD8 *pu1_dst,
2312 WORD32 dst_strd,
2313 WORD32 nt,
2314 WORD32 mode)
2315 {
2316
2317 WORD32 row, col, k;
2318 WORD32 two_nt;
2319 WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
2320 WORD32 pos, fract = 1000, fract_prev;
2321 WORD32 ref_idx;
2322
2323 UWORD8 *ref_main;
2324 UWORD8 *ref_main_tmp;
2325
2326 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2327 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2328 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2329 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2330
2331 UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
2332
2333 uint16x8_t mul_res1, mul_res2, add_res;
2334 uint8x8_t dup_const_fract, dup_const_32_fract;
2335 uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2336 uint8x8_t ref_left_t;
2337 uint32x2_t ref_left_tmp;
2338 UNUSED(src_strd);
2339 ref_left_tmp = vdup_n_u32(0);
2340
2341 inv_ang_sum = 128;
2342 two_nt = 2 * nt;
2343
2344 intra_pred_ang = gai4_ihevc_ang_table[mode];
2345
2346 inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
2347
2348 pu1_ref_tmp1 += two_nt;
2349
2350 ref_main = ref_temp + (nt - 1);
2351 ref_main_tmp = ref_main;
2352
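/* The loops below build the main reference for the horizontal modes by */
/* reversing the left neighbours; effectively (sketch for illustration) */
/*     ref_main[i] = pu1_ref[two_nt - i]   for i = 0 .. nt - 1 */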
2353 if(0 == (nt & 7))
2354 {
2355 pu1_ref_tmp2 += (two_nt - 7);
2356
2357 for(k = nt - 1; k >= 0; k -= 8)
2358 {
2359
2360 ref_left_t = vld1_u8(pu1_ref_tmp2);
2361
2362 ref_left_t = vrev64_u8(ref_left_t);
2363 vst1_u8(ref_main_tmp, ref_left_t);
2364 ref_main_tmp += 8;
2365 pu1_ref_tmp2 -= 8;
2366
2367 }
2368
2369 }
2370 else
2371 {
2372 uint8x8_t rev_val;
2373 pu1_ref_tmp2 += (two_nt - (nt - 1));
2374
2375 for(k = nt - 1; k >= 0; k -= 8)
2376 {
2377
2378 ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
2379
2380 rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
2381 vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
2382
2383 }
2384
2385 }
2386
2387 ref_main[nt] = pu1_ref[two_nt - nt];
2388
2389 /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
2390
2391 ref_idx = (nt * intra_pred_ang) >> 5;
2392
2393 /* SIMD optimization can be done using a look-up table for the loop below */
2394 /* For negative angles, derive the main reference samples from the side */
2395 /* reference samples; refer to section 8.4.4.2.6 */
2396 for(k = -1; k > ref_idx; k--)
2397 {
2398 inv_ang_sum += inv_ang;
2399 ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
2400 }
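/* With inv_ang_sum starting at 128, the loop above effectively computes */
/* (sketch for illustration) */
/*     ref_main[-m] = pu1_ref[two_nt + ((128 + m * inv_ang) >> 8)] */
/* for m = 1 .. (-ref_idx - 1), projecting the above reference onto the */
/* extended main (left) reference. */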
2401
2402 UWORD8 *ref_main_tmp1 = ref_main;
2403 UWORD8 *ref_main_tmp2 = ref_main;
2404
2405 ref_main_tmp2 += 1;
2406
2407 if(0 == (nt & 7))
2408 {
2409 /* For angles other than 45 degrees, interpolate between 2 neighboring */
2410 /* samples, weighted by the distance, to obtain the destination sample */
2411 for(col = 0; col < nt; col++)
2412 {
2413
2414 fract_prev = fract;
2415 pos = ((col + 1) * intra_pred_ang);
2416 fract = pos & (31);
2417
2418 if(fract_prev < fract)
2419 {
2420 ref_main_tmp1 -= 1;
2421 ref_main_tmp2 -= 1;
2422 }
2423
2424 dup_const_fract = vdup_n_u8((uint8_t)fract);
2425 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2426
2427 // Do linear filtering
2428 for(row = nt; row > 0; row -= 8)
2429 {
2430 ref_main_idx = vld1_u8(ref_main_tmp1);
2431
2432 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2433
2434 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2435 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2436
2437 add_res = vaddq_u16(mul_res1, mul_res2);
2438
2439 shift_res = vrshrn_n_u16(add_res, 5);
2440
2441 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2442 pu1_dst_tmp1 += dst_strd;
2443
2444 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2445 pu1_dst_tmp1 += dst_strd;
2446
2447 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2448 pu1_dst_tmp1 += dst_strd;
2449
2450 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2451 pu1_dst_tmp1 += dst_strd;
2452
2453 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2454 pu1_dst_tmp1 += dst_strd;
2455
2456 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2457 pu1_dst_tmp1 += dst_strd;
2458
2459 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2460 pu1_dst_tmp1 += dst_strd;
2461
2462 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2463 pu1_dst_tmp1 += dst_strd;
2464
2465 ref_main_tmp1 += 8;
2466 ref_main_tmp2 += 8;
2467 }
2468
2469 ref_main_tmp1 -= nt;
2470 ref_main_tmp2 -= nt;
2471
2472 pu1_dst_tmp2 += 1;
2473 pu1_dst_tmp1 = pu1_dst_tmp2;
2474 }
2475 }
2476 else
2477 {
2478 uint32x2_t ref_main_idx1, ref_main_idx2;
2479
2480 ref_main_idx1 = vdup_n_u32(0);
2481 ref_main_idx2 = vdup_n_u32(0);
2482
2483 for(col = 0; col < nt; col++)
2484 {
2485 fract_prev = fract;
2486 pos = ((col + 1) * intra_pred_ang);
2487 fract = pos & (31);
2488
2489 if(fract_prev < fract)
2490 {
2491 ref_main_tmp1 -= 1;
2492 ref_main_tmp2 -= 1;
2493 }
2494
2495 dup_const_fract = vdup_n_u8((uint8_t)fract);
2496 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2497
2498 for(row = nt; row > 0; row -= 4)
2499 {
2500
2501 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2502 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2503
2504 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2505 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2506
2507 add_res = vaddq_u16(mul_res1, mul_res2);
2508
2509 shift_res = vrshrn_n_u16(add_res, 5);
2510
2511 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2512 pu1_dst_tmp1 += dst_strd;
2513
2514 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2515 pu1_dst_tmp1 += dst_strd;
2516
2517 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2518 pu1_dst_tmp1 += dst_strd;
2519
2520 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2521 pu1_dst_tmp1 += dst_strd;
2522
2523 }
2524
2525 pu1_dst_tmp2 += 1;
2526 pu1_dst_tmp1 = pu1_dst_tmp2;
2527
2528 }
2529
2530 }
2531 }
2532
2533 /**
2534 *******************************************************************************
2535 *
2536 * @brief
2537 * Intra prediction interpolation filter for luma mode 19 to mode 25
2538 *
2539 * @par Description:
2540 * Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with
2541 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2542 * block location pointed by 'pu1_dst'
2543 *
2544 * @param[in] pu1_src
2545 * UWORD8 pointer to the source
2546 *
2547 * @param[out] pu1_dst
2548 * UWORD8 pointer to the destination
2549 *
2550 * @param[in] src_strd
2551 * integer source stride
2552 *
2553 * @param[in] dst_strd
2554 * integer destination stride
2555 *
2556 * @param[in] nt
2557 * integer Transform Block size
2558 *
2559 * @param[in] mode
2560 * integer intraprediction mode
2561 *
2562 * @returns
2563 *
2564 * @remarks
2565 * None
2566 *
2567 *******************************************************************************
2568 */
2569
2570
2571 void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
2572 WORD32 src_strd,
2573 UWORD8 *pu1_dst,
2574 WORD32 dst_strd,
2575 WORD32 nt,
2576 WORD32 mode)
2577 {
2578
2579 WORD32 row, col, k;
2580 WORD32 two_nt, intra_pred_ang;
2581 WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
2582 WORD32 ref_idx;
2583 UWORD8 *ref_main;
2584 UWORD8 *ref_main_tmp;
2585 UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
2586
2587 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2588 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2589 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2590
2591 uint16x8_t mul_res1, mul_res2, add_res;
2592 uint8x8_t dup_const_fract, dup_const_32_fract;
2593 uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2594 uint8x8_t ref_above_t;
2595 uint32x2_t ref_above_tmp;
2596 UNUSED(src_strd);
2597 ref_above_tmp = vdup_n_u32(0);
2598
2599 two_nt = 2 * nt;
2600 intra_pred_ang = gai4_ihevc_ang_table[mode];
2601 inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
2602
2603 /* Intermediate reference samples for negative angle modes */
2604 /* This has to be removed during optimization */
2605 pu1_ref_tmp1 += two_nt;
2606
2607
2608 ref_main = ref_temp + (nt - 1);
2609 ref_main_tmp = ref_main;
2610
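/* For the vertical modes the main reference is simply the row above, so the */
/* copy loops below give ref_main[i] = pu1_ref[two_nt + i] for i = 0 .. nt - 1 */
/* (sketch for illustration). */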
2611 if(0 == (nt & 7))
2612 {
2613 pu1_ref_tmp2 += (two_nt - 7);
2614 for(k = nt - 1; k >= 0; k -= 8)
2615 {
2616
2617 ref_above_t = vld1_u8(pu1_ref_tmp1);
2618 vst1_u8(ref_main_tmp, ref_above_t);
2619 ref_main_tmp += 8;
2620 pu1_ref_tmp1 += 8;
2621
2622 }
2623
2624 }
2625 else
2626 {
2627 pu1_ref_tmp2 += (two_nt - (nt - 1));
2628
2629 for(k = nt - 1; k >= 0; k -= 4)
2630 {
2631
2632 ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
2633 vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
2634
2635 }
2636
2637 }
2638
2639 ref_main[nt] = pu1_ref[two_nt + nt];
2640
2641 /* For vertical modes, (ref main = ref above) (ref side = ref left) */
2642
2643 ref_idx = (nt * intra_pred_ang) >> 5;
2644 inv_ang_sum = 128;
2645
2646 /* SIMD optimization can be done using a look-up table for the loop below */
2647 /* For negative angles, derive the main reference samples from the side */
2648 /* reference samples; refer to section 8.4.4.2.6 */
2649 for(k = -1; k > ref_idx; k--)
2650 {
2651 inv_ang_sum += inv_ang;
2652 ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
2653 }
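/* Same projection as in the horizontal modes, but mirrored: the side (left) */
/* samples are fetched below the top-left, i.e. (sketch for illustration) */
/*     ref_main[-m] = pu1_ref[two_nt - ((128 + m * inv_ang) >> 8)] */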
2654
2655 UWORD8 *ref_main_tmp1 = ref_main;
2656 UWORD8 *ref_main_tmp2 = ref_main;
2657
2658 ref_main_tmp2 += 1;
2659
2660 if(0 == (nt & 7))
2661 {
2662 /* For angles other than 45 degrees, interpolate between 2 neighboring */
2663 /* samples, weighted by the distance, to obtain the destination sample */
2664 for(row = 0; row < nt; row++)
2665 {
2666
2667 fract_prev = fract;
2668 pos = ((row + 1) * intra_pred_ang);
2669 fract = pos & (31);
2670
2671 if(fract_prev < fract)
2672 {
2673 ref_main_tmp1 -= 1;
2674 ref_main_tmp2 -= 1;
2675 }
2676
2677 dup_const_fract = vdup_n_u8((uint8_t)fract);
2678 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2679
2680 // Do linear filtering
2681 for(col = nt; col > 0; col -= 8)
2682 {
2683 ref_main_idx = vld1_u8(ref_main_tmp1);
2684
2685 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2686
2687 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2688 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2689
2690 add_res = vaddq_u16(mul_res1, mul_res2);
2691
2692 shift_res = vrshrn_n_u16(add_res, 5);
2693
2694 vst1_u8(pu1_dst_tmp1, shift_res);
2695 pu1_dst_tmp1 += 8;
2696
2697 ref_main_tmp1 += 8;
2698 ref_main_tmp2 += 8;
2699 }
2700
2701 ref_main_tmp1 -= nt;
2702 ref_main_tmp2 -= nt;
2703
2704 pu1_dst_tmp1 += (dst_strd - nt);
2705 }
2706 }
2707 else
2708 {
2709 uint32x2_t ref_main_idx1, ref_main_idx2;
2710
2711 ref_main_idx1 = vdup_n_u32(0);
2712 ref_main_idx2 = vdup_n_u32(0);
2713
2714 for(row = 0; row < nt; row++)
2715 {
2716 fract_prev = fract;
2717 pos = ((row + 1) * intra_pred_ang);
2718 fract = pos & (31);
2719
2720 if(fract_prev < fract)
2721 {
2722 ref_main_tmp1 -= 1;
2723 ref_main_tmp2 -= 1;
2724 }
2725
2726 dup_const_fract = vdup_n_u8((uint8_t)fract);
2727 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2728
2729 for(col = nt; col > 0; col -= 4)
2730 {
2731
2732 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2733 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2734
2735 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2736 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2737
2738 add_res = vaddq_u16(mul_res1, mul_res2);
2739
2740 shift_res = vrshrn_n_u16(add_res, 5);
2741
2742 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2743 pu1_dst_tmp1 += 4;
2744
2745 }
2746 pu1_dst_tmp1 += (dst_strd - nt);
2747 }
2748
2749 }
2750
2751 }
2752
2753 /**
2754 *******************************************************************************
2755 *
2756 * @brief
2757 * Intra prediction interpolation filter for luma mode 27 to mode 33
2758 *
2759 * @par Description:
2760 * Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
2761 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2762 * block location pointed by 'pu1_dst'
2763 *
2764 * @param[in] pu1_src
2765 * UWORD8 pointer to the source
2766 *
2767 * @param[out] pu1_dst
2768 * UWORD8 pointer to the destination
2769 *
2770 * @param[in] src_strd
2771 * integer source stride
2772 *
2773 * @param[in] dst_strd
2774 * integer destination stride
2775 *
2776 * @param[in] nt
2777 * integer Transform Block size
2778 *
2779 * @param[in] mode
2780 * integer intraprediction mode
2781 *
2782 * @returns
2783 *
2784 * @remarks
2785 * None
2786 *
2787 *******************************************************************************
2788 */
2789
2790
2791 void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
2792 WORD32 src_strd,
2793 UWORD8 *pu1_dst,
2794 WORD32 dst_strd,
2795 WORD32 nt,
2796 WORD32 mode)
2797 {
2798
2799 WORD32 row, col;
2800 WORD32 intra_pred_ang;
2801 WORD32 pos, fract = 0, fract_prev;
2802
2803 WORD32 two_nt = 2 * nt;
2804 UNUSED(src_strd);
2805 if(0 == (nt & 7))
2806 {
2807
2808 UWORD8 *pu1_ref_main_idx = pu1_ref;
2809 UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2810
2811 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2812 pu1_ref_main_idx += (two_nt + 1);
2813 pu1_ref_main_idx_1 += (two_nt + 2);
2814
2815 uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2816 uint8x8_t shift_res;
2817 uint16x8_t mul_res1, mul_res2, add_res;
2818
2819 /* Intra Pred Angle according to the mode */
2820 intra_pred_ang = gai4_ihevc_ang_table[mode];
2821
2822 for(row = 0; row < nt; row++)
2823 {
2824 fract_prev = fract;
2825
2826 pos = ((row + 1) * intra_pred_ang);
2827 fract = pos & (31);
2828
2829 if(fract_prev > fract)
2830 {
2831 pu1_ref_main_idx += 1;
2832 pu1_ref_main_idx_1 += 1;
2833 }
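/* For these positive vertical angles the integer reference index grows by at */
/* most one per row, so a decrease in fract (fract_prev > fract) signals that */
/* the index has stepped forward and the pointers must advance. For example, */
/* with an angle of 26 (mode 33): row 0 gives pos = 26, fract = 26; row 1 gives */
/* pos = 52, fract = 20, so the pointers move up by one. */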
2834
2835 dup_const_fract = vdup_n_u8((uint8_t)fract);
2836 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2837
2838 for(col = nt; col > 0; col -= 8)
2839 {
2840 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2841 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2842
2843 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2844 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2845
2846 add_res = vaddq_u16(mul_res1, mul_res2);
2847
2848 shift_res = vrshrn_n_u16(add_res, 5);
2849
2850 vst1_u8(pu1_dst_tmp1, shift_res);
2851 pu1_dst_tmp1 += 8;
2852
2853 pu1_ref_main_idx += 8;
2854 pu1_ref_main_idx_1 += 8;
2855 }
2856
2857 pu1_ref_main_idx -= nt;
2858 pu1_ref_main_idx_1 -= nt;
2859
2860 pu1_dst_tmp1 += (dst_strd - nt);
2861 }
2862
2863 }
2864 else
2865 {
2866 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2867 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2868 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2869
2870 pu1_ref_tmp1 += (two_nt + 1);
2871 pu1_ref_tmp2 += (two_nt + 2);
2872
2873 uint8x8_t dup_fract, dup_32_fract, shift_res;
2874 uint16x8_t mul_res1, mul_res2, add_res;
2875 uint32x2_t pu1_ref_val1, pu1_ref_val2;
2876
2877 pu1_ref_val1 = vdup_n_u32(0);
2878 pu1_ref_val2 = vdup_n_u32(0);
2879
2880 /* Intra Pred Angle according to the mode */
2881 intra_pred_ang = gai4_ihevc_ang_table[mode];
2882
2883 for(row = 0; row < nt; row++)
2884 {
2885 fract_prev = fract;
2886 pos = ((row + 1) * intra_pred_ang);
2887 fract = pos & (31);
2888 if(fract_prev > fract)
2889 {
2890 pu1_ref_tmp1 += 1;
2891 pu1_ref_tmp2 += 1;
2892 }
2893 dup_fract = vdup_n_u8((uint8_t)fract);
2894 dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2895
2896 for(col = nt; col > 0; col -= 4)
2897 {
2898 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2899 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2900
2901 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2902 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2903
2904 add_res = vaddq_u16(mul_res1, mul_res2);
2905
2906 shift_res = vrshrn_n_u16(add_res, 5);
2907
2908 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2909 pu1_dst_tmp1 += 4;
2910
2911 }
2912
2913 pu1_dst_tmp1 += (dst_strd - nt);
2914
2915 }
2916
2917
2918 }
2919
2920 }
2921