/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_weighted_pred_x86_intr.c
*
* @brief
*  Contains function definitions for weighted prediction used in inter
* prediction
*
* @author
*
*
* @par List of Functions:
*   - ihevc_weighted_pred_uni_sse42()
*   - ihevc_weighted_pred_bi_sse42()
*   - ihevc_weighted_pred_bi_default_sse42()
*   - ihevc_weighted_pred_chroma_uni_sse42()
*   - ihevc_weighted_pred_chroma_bi_sse42()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>
#include <assert.h>

#include "ihevc_debug.h"
#include "ihevc_typedefs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_inter_pred.h"

#include <immintrin.h>

/**
*******************************************************************************
*
* @brief
*  Does uni-weighted prediction on the array pointed to by pi2_src and stores
* the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  Weight to be multiplied with the source
*
* @param[in] off0
*  Offset to be added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Level shift added to the source before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

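/* A minimal scalar sketch of the operation implemented by the SIMD routine
 * below, kept under #if 0 for illustration only; it is not part of the
 * library. The helper name is ours, and it assumes the CLIP_U8 macro made
 * available through ihevc_macros.h. It mirrors the formula documented above. */
#if 0
static void ihevc_weighted_pred_uni_ref(WORD16 *pi2_src,
                                        UWORD8 *pu1_dst,
                                        WORD32 src_strd,
                                        WORD32 dst_strd,
                                        WORD32 wgt0,
                                        WORD32 off0,
                                        WORD32 shift,
                                        WORD32 lvl_shift,
                                        WORD32 ht,
                                        WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* Level-shift, weight, round, shift, add the offset, clip to 8 bit */
            WORD32 i4_tmp = (pi2_src[col] + lvl_shift) * wgt0;
            i4_tmp += 1 << (shift - 1);
            i4_tmp = (i4_tmp >> shift) + off0;
            pu1_dst[col] = CLIP_U8(i4_tmp);
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif
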
void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
                                   UWORD8 *pu1_dst,
                                   WORD32 src_strd,
                                   WORD32 dst_strd,
                                   WORD32 wgt0,
                                   WORD32 off0,
                                   WORD32 shift,
                                   WORD32 lvl_shift,
                                   WORD32 ht,
                                   WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 dst0, dst1, dst2, dst3;

    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */
    __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 4 == 0); /* checking assumption */
    ASSERT(ht % 4 == 0); /* checking assumption */

    temp = 1 << (shift - 1);

    // setting values in registers
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set1_epi32(wgt0);
    off0_4x32b = _mm_set1_epi32(off0);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;

        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 8)
            {   /* for row = 0, 1, 2, 3 */

                /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /* row = 0 */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                /* row = 1 */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
                /* row = 2 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
                /* row = 3 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));

                /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* First 4 pixels */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */ /* First 4 pixels */
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);

                /* (pi2_src[col] + lvl_shift) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */ /* Last 4 pixels */
                src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* First 4 pixels */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
                src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);

                /* i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

                /* pack the eight 32-bit results of each row to signed 16 bit */
                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3 */

                /* To update pointer */
                pi2_src += 8;
                pu1_dst += 8;

            } /* inner loop ends here (8 output values per row per iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
    else /* wd multiple of 4 case */
    {
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 4)
        {
            for(col = 0; col < wd; col += 4)
            {   /* for row = 0, 1, 2, 3 */

                /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                /* row = 1 */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                /* row = 2 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                /* row = 3 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                /* sign-extend pix. 3:0 from 16 bit to 32 bit */
                src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                /* (pi2_src[col] + lvl_shift) */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);

                /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                /* i4_tmp += 1 << (shift - 1) */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                /* i4_tmp = (i4_tmp >> shift) + off0; */
                src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
                src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                /* dst row = 1 to 3 */
                src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
                src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);

                /* row = 1 to row = 3 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                /* To update pointer */
                pi2_src += 4;
                pu1_dst += 4;

            } /* inner loop ends here (4 output values per row per iteration) */

            pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
            pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */

        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Does chroma uni-weighted prediction on the array pointed to by pi2_src and
* stores the result at the location pointed to by pu1_dst
*
* @par Description:
*  dst = (((src + lvl_shift) * wgt0 + (1 << (shift - 1))) >> shift) + offset
*
* @param[in] pi2_src
*  Pointer to the source
*
* @param[out] pu1_dst
*  Pointer to the destination
*
* @param[in] src_strd
*  Source stride
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  Weight to be multiplied with the Cb source
*
* @param[in] wgt0_cr
*  Weight to be multiplied with the Cr source
*
* @param[in] off0_cb
*  Offset for Cb, added after rounding and shifting
*
* @param[in] off0_cr
*  Offset for Cr, added after rounding and shifting
*
* @param[in] shift
*  (14 - bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift
*  Level shift added to the source before weighting
*
* @param[in] ht
*  Height of the source
*
* @param[in] wd
*  Width of the source (each colour component)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/

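/* A minimal scalar sketch of the chroma path, kept under #if 0 for
 * illustration only; it is not part of the library and the helper name is
 * ours. The source holds interleaved Cb/Cr samples, so even columns use the
 * Cb weight/offset and odd columns the Cr ones (which is why the SIMD code
 * below broadcasts the weights as cr,cb,cr,cb pairs); CLIP_U8 is assumed
 * from ihevc_macros.h. */
#if 0
static void ihevc_weighted_pred_chroma_uni_ref(WORD16 *pi2_src,
                                               UWORD8 *pu1_dst,
                                               WORD32 src_strd,
                                               WORD32 dst_strd,
                                               WORD32 wgt0_cb,
                                               WORD32 wgt0_cr,
                                               WORD32 off0_cb,
                                               WORD32 off0_cr,
                                               WORD32 shift,
                                               WORD32 lvl_shift,
                                               WORD32 ht,
                                               WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col += 2)
        {
            /* Cb sample at the even index, Cr sample at the odd index */
            WORD32 i4_tmp_cb = (pi2_src[col] + lvl_shift) * wgt0_cb
                               + (1 << (shift - 1));
            WORD32 i4_tmp_cr = (pi2_src[col + 1] + lvl_shift) * wgt0_cr
                               + (1 << (shift - 1));
            pu1_dst[col] = CLIP_U8((i4_tmp_cb >> shift) + off0_cb);
            pu1_dst[col + 1] = CLIP_U8((i4_tmp_cr >> shift) + off0_cr);
        }
        pi2_src += src_strd;
        pu1_dst += dst_strd;
    }
}
#endif
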
void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd,
                                          WORD32 dst_strd,
                                          WORD32 wgt0_cb,
                                          WORD32 wgt0_cr,
                                          WORD32 off0_cb,
                                          WORD32 off0_cr,
                                          WORD32 shift,
                                          WORD32 lvl_shift,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp, wdx2;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */

    __m128i src_temp0_4x32b, src_temp1_4x32b;
    __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;

    ASSERT(wd % 2 == 0); /* checking assumption */
    ASSERT(ht % 2 == 0); /* checking assumption */

    temp = 1 << (shift - 1);
    wdx2 = 2 * wd;

    // setting values in registers
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);

#if 0 /* Enable this for the ht % 4 == 0 case; it degraded performance for smaller sizes while improving it for larger ones. */
    if(0 == (ht & 3)) /* ht multiple of 4 case */
    {
        if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b;
            __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b;
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 16)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                    /* row = 0 */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
                    /* row = 0 */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
                    /* row = 0 */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
                    /* row = 1 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Second 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Third 4 pixels */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Last 4 pixels */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
                    src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
                    src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);

                    /* store 16 8-bit output values */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0 */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1 */

                    /* row = 2 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                    /* row = 3 */
                    src_temp9_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
                    /* row = 2 */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
                    /* row = 3 */
                    src_temp11_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));
                    /* row = 2 */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 8));
                    /* row = 3 */
                    src_temp13_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 8));
                    /* row = 2 */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 12));
                    /* row = 3 */
                    src_temp15_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 12));

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */
                    src_temp8_4x32b  = _mm_cvtepi16_epi32(src_temp8_4x32b);
                    src_temp9_4x32b  = _mm_cvtepi16_epi32(src_temp9_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift_4x32b);
                    src_temp9_4x32b = _mm_add_epi32(src_temp9_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp8_4x32b  = _mm_mullo_epi32(src_temp8_4x32b, wgt0_4x32b);
                    src_temp9_4x32b  = _mm_mullo_epi32(src_temp9_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Second 4 pixels */
                    src_temp10_4x32b  = _mm_cvtepi16_epi32(src_temp10_4x32b);
                    src_temp11_4x32b  = _mm_cvtepi16_epi32(src_temp11_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp10_4x32b = _mm_add_epi32(src_temp10_4x32b, lvl_shift_4x32b);
                    src_temp11_4x32b = _mm_add_epi32(src_temp11_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp10_4x32b  = _mm_mullo_epi32(src_temp10_4x32b, wgt0_4x32b);
                    src_temp11_4x32b  = _mm_mullo_epi32(src_temp11_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Third 4 pixels */
                    src_temp12_4x32b  = _mm_cvtepi16_epi32(src_temp12_4x32b);
                    src_temp13_4x32b  = _mm_cvtepi16_epi32(src_temp13_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp12_4x32b = _mm_add_epi32(src_temp12_4x32b, lvl_shift_4x32b);
                    src_temp13_4x32b = _mm_add_epi32(src_temp13_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp12_4x32b  = _mm_mullo_epi32(src_temp12_4x32b, wgt0_4x32b);
                    src_temp13_4x32b  = _mm_mullo_epi32(src_temp13_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Last 4 pixels */
                    src_temp14_4x32b  = _mm_cvtepi16_epi32(src_temp14_4x32b);
                    src_temp15_4x32b  = _mm_cvtepi16_epi32(src_temp15_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp14_4x32b = _mm_add_epi32(src_temp14_4x32b, lvl_shift_4x32b);
                    src_temp15_4x32b = _mm_add_epi32(src_temp15_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp14_4x32b  = _mm_mullo_epi32(src_temp14_4x32b, wgt0_4x32b);
                    src_temp15_4x32b  = _mm_mullo_epi32(src_temp15_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, const_temp_4x32b);
                    src_temp9_4x32b = _mm_add_epi32(src_temp9_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b, shift);
                    src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32(src_temp10_4x32b, const_temp_4x32b);
                    src_temp11_4x32b = _mm_add_epi32(src_temp11_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b, shift);
                    src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32(src_temp12_4x32b, const_temp_4x32b);
                    src_temp13_4x32b = _mm_add_epi32(src_temp13_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b, shift);
                    src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32(src_temp14_4x32b, const_temp_4x32b);
                    src_temp15_4x32b = _mm_add_epi32(src_temp15_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b, shift);
                    src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b, shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, off0_4x32b);
                    src_temp9_4x32b = _mm_add_epi32(src_temp9_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
                    src_temp10_4x32b = _mm_add_epi32(src_temp10_4x32b, off0_4x32b);
                    src_temp11_4x32b = _mm_add_epi32(src_temp11_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
                    src_temp12_4x32b = _mm_add_epi32(src_temp12_4x32b, off0_4x32b);
                    src_temp13_4x32b = _mm_add_epi32(src_temp13_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
                    src_temp14_4x32b = _mm_add_epi32(src_temp14_4x32b, off0_4x32b);
                    src_temp15_4x32b = _mm_add_epi32(src_temp15_4x32b, off0_4x32b);

                    src_temp8_4x32b = _mm_packs_epi32(src_temp8_4x32b, src_temp10_4x32b);
                    src_temp9_4x32b = _mm_packs_epi32(src_temp9_4x32b, src_temp11_4x32b);
                    src_temp12_4x32b = _mm_packs_epi32(src_temp12_4x32b, src_temp14_4x32b);
                    src_temp13_4x32b = _mm_packs_epi32(src_temp13_4x32b, src_temp15_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp8_4x32b = _mm_packus_epi16(src_temp8_4x32b, src_temp12_4x32b);
                    src_temp9_4x32b = _mm_packus_epi16(src_temp9_4x32b, src_temp13_4x32b);

                    /* store 16 8-bit output values */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp8_4x32b); /* row = 2 */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp9_4x32b); /* row = 3 */

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16; /* Pointer update */

                } /* inner loop ends here (16 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 4 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                    /* row = 0 */ /* Last 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
                    /* row = 2 */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
                    /* row = 3 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* rows 0, 1 */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* rows 2, 3 */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Last 4 pixels, rows 0, 1 */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */ /* Last 4 pixels, rows 2, 3 */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* rows 2, 3 */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */ /* rows 2, 3 */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
                    src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
                    src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3 */

                    pi2_src += 8;   /* Pointer update */
                    pu1_dst += 8; /* Pointer update */

                } /* inner loop ends here (8 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 4 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1, dst2, dst3;
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 4)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 1 * src_strd));
                    /* row = 2 */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
                    /* row = 3 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));

                    /* sign-extend pix. 3:0 from 16 bit to 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);

                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
                    /* i4_tmp = (i4_tmp >> shift) + off0; */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
                    src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* dst row = 1 to 3 */
                    src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
                    src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
                    src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);

                    /* store four 8-bit output values */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
                    dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    /* row = 2 */
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    /* row = 3 */
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                    pi2_src += 4;   /* Pointer update */
                    pu1_dst += 4; /* Pointer update */

                } /* inner loop ends here (4 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 4 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 4 * dst_strd; /* Pointer update */
            }
        }
    }
    else /* ht multiple of 2 case */
#endif
911 
912     {
913         if(0 == (wdx2 & 15)) /* 2*wd multiple of 168 case */
914         {
915             __m128i src_temp2_4x32b, src_temp3_4x32b;
916             __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
917             /*  outer for loop starts from here */
918             for(row = 0; row < ht; row += 2)
919             {
920                 for(col = 0; col < wdx2; col += 16)
921                 {
922                     /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
923                     src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
924                     /* row = 1 */
925                     src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
926 
927                     /* row = 0 */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
                    /* row = 0 */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
                    /* row = 1 */
                    src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
                    /* row = 0 */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
                    /* row = 1 */
                    src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));

                    /* sign-extend the first four 16-bit pixels (3:0) to 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* sign-extend the next four 16-bit pixels to 32 bit */ /* Second 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* sign-extend the next four 16-bit pixels to 32 bit */ /* Third 4 pixels */
                    src_temp4_4x32b  = _mm_cvtepi16_epi32(src_temp4_4x32b);
                    src_temp5_4x32b  = _mm_cvtepi16_epi32(src_temp5_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp4_4x32b  = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
                    src_temp5_4x32b  = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);

                    /* sign-extend the last four 16-bit pixels to 32 bit */ /* Last 4 pixels */
                    src_temp6_4x32b  = _mm_cvtepi16_epi32(src_temp6_4x32b);
                    src_temp7_4x32b  = _mm_cvtepi16_epi32(src_temp7_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp6_4x32b  = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
                    src_temp7_4x32b  = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b,  shift);
                    src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b,  shift);
                    src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0 */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0 */ /* Second 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0 */ /* Third 4 pixels */
                    src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
                    src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0 */ /* Last 4 pixels */
                    src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
                    src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);

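                    /* Two-stage saturating pack implements CLIP_U8: the signed
                       32->16 packs below gather row 0 cols 0-7 / 8-15 into
                       src_temp0_4x32b / src_temp4_4x32b (row 1 likewise into
                       src_temp1_4x32b / src_temp5_4x32b), and the unsigned
                       16->8 pack then clips every result to [0, 255]. */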
                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
                    src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
                    src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);

                    /* store 16 8-bit output values */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0 */
                    _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1 */

                    pi2_src += 16;  /* Pointer update */
                    pu1_dst += 16;  /* Pointer update */

                } /* inner loop ends here (16 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd;  /* Pointer update */
            }
        }
        else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
        {
            __m128i src_temp2_4x32b, src_temp3_4x32b;
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 8)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /* row = 0 */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
                    /* row = 1 */
                    src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));

                    /* sign-extend the first four 16-bit pixels (3:0) to 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* sign-extend the next four 16-bit pixels to 32 bit */ /* Last 4 pixels */
                    src_temp2_4x32b  = _mm_cvtepi16_epi32(src_temp2_4x32b);
                    src_temp3_4x32b  = _mm_cvtepi16_epi32(src_temp3_4x32b);
                    /* (pi2_src[col] + lvl_shift) */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp2_4x32b  = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
                    src_temp3_4x32b  = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                    /* (i4_tmp >> shift) */
                    src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b,  shift);
                    src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0 */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
                    /* i4_tmp = (i4_tmp >> shift) + off0 */ /* Last 4 pixels */
                    src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
                    src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
                    src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
                    src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                    /* store eight 8-bit output values */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0 */
                    _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1 */

                    pi2_src += 8;  /* Pointer update */
                    pu1_dst += 8;  /* Pointer update */

                } /* inner loop ends here (8 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd;  /* Pointer update */
            }
        }
        else /* 2*wd multiple of 4 case */
        {
            WORD32 dst0, dst1;
            /* outer for loop starts from here */
            for(row = 0; row < ht; row += 2)
            {
                for(col = 0; col < wdx2; col += 4)
                {
                    /* row = 0 */ /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                    src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
                    /* row = 1 */
                    src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));

                    /* sign-extend the four 16-bit pixels (3:0) to 32 bit */
                    src_temp0_4x32b  = _mm_cvtepi16_epi32(src_temp0_4x32b);
                    src_temp1_4x32b  = _mm_cvtepi16_epi32(src_temp1_4x32b);

                    /* (pi2_src[col] + lvl_shift) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);

                    /* i4_tmp = (pi2_src[col] + lvl_shift) * wgt0 */
                    src_temp0_4x32b  = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
                    src_temp1_4x32b  = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);

                    /* i4_tmp += 1 << (shift - 1) */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);

                    /* (i4_tmp >> shift) */
                    src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b,  shift);
                    src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);

                    /* i4_tmp = (i4_tmp >> shift) + off0 */
                    src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
                    src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);

                    src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp); */
                    src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);

                    dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
                    /* move the row 1 result (dword 1) down to lane 0 */
                    src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);

                    /* store four 8-bit output values */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
                    /* row = 1 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                    pi2_src += 4;  /* Pointer update */
                    pu1_dst += 4;  /* Pointer update */

                } /* inner loop ends here (4 output values per row per iteration) */
                pi2_src = pi2_src - wdx2 + 2 * src_strd;  /* Pointer update */
                pu1_dst = pu1_dst - wdx2 + 2 * dst_strd;  /* Pointer update */
            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Does bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores the result at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0
*  weight to be multiplied to source 1
*
* @param[in] off0
*  offset 0
*
* @param[in] wgt1
*  weight to be multiplied to source 2
*
* @param[in] off1
*  offset 1
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

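/* For reference, a minimal scalar sketch of the bi-weighted prediction that
   the SSE4.2 routine below vectorises (illustrative only; CLIP_U8 clips the
   result to [0, 255]):

       for(row = 0; row < ht; row++)
           for(col = 0; col < wd; col++)
           {
               WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
                             + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1
                             + ((off0 + off1 + 1) << (shift - 1));
               pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
           }
*/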
void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
                                  WORD16 *pi2_src2,
                                  UWORD8 *pu1_dst,
                                  WORD32 src_strd1,
                                  WORD32 src_strd2,
                                  WORD32 dst_strd,
                                  WORD32 wgt0,
                                  WORD32 off0,
                                  WORD32 wgt1,
                                  WORD32 off1,
                                  WORD32 shift,
                                  WORD32 lvl_shift1,
                                  WORD32 lvl_shift2,
                                  WORD32 ht,
                                  WORD32 wd)
{
    WORD32 row, col, temp;

    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;


    ASSERT(wd % 4 == 0); /* checking assumption */
    ASSERT(ht % 2 == 0); /* checking assumption */

    temp = (off0 + off1 + 1) << (shift - 1);

    // setting values in registers
    const_temp_4x32b = _mm_set1_epi32(temp);
    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
    wgt0_4x32b = _mm_set1_epi32(wgt0);
    wgt1_4x32b = _mm_set1_epi32(wgt1);

    if(0 == (wd & 7)) /* wd multiple of 8 case */
    {
        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 8)
            {
                /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
                /* Next 4 pixels */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */

                /* sign-extend the first four 16-bit pixels (3:0) to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /* i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /* (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1 */

                pi2_src1 += 8;  /* Pointer update */
                pi2_src2 += 8;  /* Pointer update */
                pu1_dst  += 8;  /* Pointer update */

            } /* inner loop ends here (8 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }
    else /* wd multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* sign-extend the four 16-bit pixels (3:0) to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /* i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /* (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);

                /* move the row 1 result (dword 1) down to lane 0 */
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4;  /* Pointer update */
                pi2_src2 += 4;  /* Pointer update */
                pu1_dst  += 4;  /* Pointer update */

            } /* inner loop ends here (4 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }

}

/**
*******************************************************************************
*
* @brief
*  Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores the result at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) * wgt0 + (src2 + lvl_shift2) * wgt1 +
* ((off0 + off1 + 1) << (shift - 1)) ) >> shift
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] wgt0_cb
*  weight to be multiplied to source 1 (Cb)
*
* @param[in] wgt0_cr
*  weight to be multiplied to source 1 (Cr)
*
* @param[in] off0_cb
*  offset 0 (Cb)
*
* @param[in] off0_cr
*  offset 0 (Cr)
*
* @param[in] wgt1_cb
*  weight to be multiplied to source 2 (Cb)
*
* @param[in] wgt1_cr
*  weight to be multiplied to source 2 (Cr)
*
* @param[in] off1_cb
*  offset 1 (Cb)
*
* @param[in] off1_cr
*  offset 1 (Cr)
*
* @param[in] shift
*  (14 Bit depth) + log2_weight_denominator
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source (each colour component)
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/

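/* For reference, a minimal scalar sketch of the chroma computation
   (illustrative only). The chroma plane is interleaved CbCr, so even columns
   use the Cb weights/offsets and odd columns the Cr ones:

       for(row = 0; row < ht; row++)
           for(col = 0; col < 2 * wd; col++)
           {
               WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb;
               WORD32 wgt1 = (col & 1) ? wgt1_cr : wgt1_cb;
               WORD32 off  = (col & 1) ? (off0_cr + off1_cr + 1) : (off0_cb + off1_cb + 1);
               WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0
                             + (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1
                             + (off << (shift - 1));
               pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
           }
*/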
void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
                                         WORD16 *pi2_src2,
                                         UWORD8 *pu1_dst,
                                         WORD32 src_strd1,
                                         WORD32 src_strd2,
                                         WORD32 dst_strd,
                                         WORD32 wgt0_cb,
                                         WORD32 wgt0_cr,
                                         WORD32 off0_cb,
                                         WORD32 off0_cr,
                                         WORD32 wgt1_cb,
                                         WORD32 wgt1_cr,
                                         WORD32 off1_cb,
                                         WORD32 off1_cr,
                                         WORD32 shift,
                                         WORD32 lvl_shift1,
                                         WORD32 lvl_shift2,
                                         WORD32 ht,
                                         WORD32 wd)
{
    WORD32 row, col, temp1, temp2;
    WORD32 wdx2;

    __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
    __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;


    ASSERT(wd % 2 == 0); /* checking assumption */
    ASSERT(ht % 2 == 0); /* checking assumption */

    temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
    temp2 = (off0_cr + off1_cr + 1) << (shift - 1);

    // setting values in registers
    const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
    lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
    lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
    wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
    wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);

    wdx2 = wd * 2;
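    /* The vectors set above follow the interleaved CbCr sample layout: even
       lanes carry the Cb weight/offset and odd lanes the Cr ones, so a single
       multiply-add applies both colour components. wdx2 is the row width in
       individual chroma samples. */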

    if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
    {
        __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 8)
            {
                /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
                /* Next 4 pixels */
                src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
                src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
                src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
                src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */

                /* sign-extend the first four 16-bit samples (3:0) to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /* i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /* (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
                src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
                src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
                src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
                src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
                src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
                src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
                src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
                src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
                src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                /* Next 4 Pixels */
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
                src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
                src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
                src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b,  shift);
                src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
                src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
                src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);

                /* store eight 8-bit output values */
                _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0 */
                _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1 */

                pi2_src1 += 8;  /* Pointer update */
                pi2_src2 += 8;  /* Pointer update */
                pu1_dst  += 8;  /* Pointer update */

            } /* inner loop ends here (8 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */

        } /* outer loop ends */
    }
    else /* wdx2 multiple of 4 case */
    {
        WORD32 dst0, dst1;
        /* outer for loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wdx2; col += 4)
            {
                /* load 8 pixel values from 7:0 pos. relative to cur. pos. */
                src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
                src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
                src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
                src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */

                /* sign-extend the four 16-bit samples (3:0) to 32 bit */
                src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
                src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
                /* (pi2_src1[col] + lvl_shift1) */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
                /* (pi2_src2[col] + lvl_shift2) */
                src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
                /* i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
                src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
                /* (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);

                src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
                src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
                src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
                src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
                src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);

                /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);

                /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
                src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
                src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);

                /* (i4_tmp >> shift) */
                src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b,  shift);
                src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b,  shift);

                src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);

                dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);

                /* move the row 1 result (dword 1) down to lane 0 */
                src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                pi2_src1 += 4;  /* Pointer update */
                pi2_src2 += 4;  /* Pointer update */
                pu1_dst  += 4;  /* Pointer update */

            } /* inner loop ends here (4 output values per row per iteration) */

            pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1;  /* Pointer update */
            pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2;  /* Pointer update */
            pu1_dst  = pu1_dst  - wdx2 + 2 * dst_strd;   /* Pointer update */
        }
    }

}

/**
*******************************************************************************
*
* @brief
*  Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
* pi2_src2 and stores the result at the location pointed by pu1_dst
*
* @par Description:
*  dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
* >> shift, where shift = 15 - BitDepth
*
* @param[in] pi2_src1
*  Pointer to source 1
*
* @param[in] pi2_src2
*  Pointer to source 2
*
* @param[out] pu1_dst
*  Pointer to destination
*
* @param[in] src_strd1
*  Source stride 1
*
* @param[in] src_strd2
*  Source stride 2
*
* @param[in] dst_strd
*  Destination stride
*
* @param[in] lvl_shift1
*  added before shift and offset
*
* @param[in] lvl_shift2
*  added before shift and offset
*
* @param[in] ht
*  height of the source
*
* @param[in] wd
*  width of the source
*
* @returns
*
* @remarks
*  None
*
* Assumption : ht % 2 == 0, wd % 4 == 0 (see the ASSERTs below).
* shift == 7 and (lvl_shift1 + lvl_shift2) can take values {0, 8K, 16K};
* in these cases the final result matches even though the intermediate
* precision is only 16 bit.
*
*******************************************************************************
*/

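/* For reference, a minimal scalar sketch of the default (equal-weight)
   bi-prediction computed below (illustrative only), with
   shift = SHIFT_14_MINUS_BIT_DEPTH + 1, i.e. 15 - BitDepth:

       for(row = 0; row < ht; row++)
           for(col = 0; col < wd; col++)
           {
               WORD32 i4_tmp = (pi2_src1[row * src_strd1 + col] + lvl_shift1)
                             + (pi2_src2[row * src_strd2 + col] + lvl_shift2)
                             + (1 << (shift - 1));
               pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
           }
*/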
void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
                                          WORD16 *pi2_src2,
                                          UWORD8 *pu1_dst,
                                          WORD32 src_strd1,
                                          WORD32 src_strd2,
                                          WORD32 dst_strd,
                                          WORD32 lvl_shift1,
                                          WORD32 lvl_shift2,
                                          WORD32 ht,
                                          WORD32 wd)
{
    WORD32 row, col, temp;
    WORD32 shift;

    __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
    __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
    __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;

    ASSERT(wd % 4 == 0); /* checking assumption */
    ASSERT(ht % 2 == 0); /* checking assumption */

    shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    temp = 1 << (shift - 1);

    // setting values in registers
    lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
    lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
    const_temp_8x16b = _mm_set1_epi16(temp);

    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
    lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
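    /* lvl_shift1_8x16b now holds the entire additive term
       (lvl_shift1 + lvl_shift2 + (1 << (shift - 1))) pre-folded into one
       16-bit vector; per the precision note above, 16-bit saturating adds
       are sufficient for these inputs. */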
1812 
1813     if(0 == (ht & 3)) /* ht multiple of 4*/
1814     {
1815         if(0 == (wd & 15)) /* wd multiple of 16 case */
1816         {
1817             __m128i src_temp9_8x16b,  src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1818             __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
1819             /*  outer for loop starts from here */
1820             for(row = 0; row < ht; row += 4)
1821             {
1822                 for(col = 0; col < wd; col += 16)
1823                 {
1824                     /*load 8 pixel values */ /* First 8 Values */
1825                     src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1826                     src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1827                     /* row = 1 */
1828                     src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1829                     src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1830                     /* row = 2 */
1831                     src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1832                     src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1833                     /* row = 3 */
1834                     src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1835                     src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1836 
1837                     /*load 8 pixel values */ /* Second 8 Values */
1838                     src_temp9_8x16b  = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1839                     src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1840                     /* row = 1 */
1841                     src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1842                     src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1843                     /* row = 2 */
1844                     src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
1845                     src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
1846 
1847                     /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1848                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1849                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1850                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1851                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1852 
1853                     /*load 8 pixel values */ /* Second 8 Values */
1854                     /* row = 3 */
1855                     src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
1856                     src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
1857 
1858                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1859                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1860                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1861                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1862                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1863 
1864                     /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1865                     src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b,  src_temp10_8x16b);
1866                     src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
1867                     src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
1868                     src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
1869 
1870                     /* (i4_tmp >> shift) */ /* First 8 Values */
1871                     src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1872                     src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
1873                     src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1874                     src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
1875 
1876                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
1877                     src_temp9_8x16b  = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
1878                     src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
1879                     src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
1880                     src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
1881 
1882                     /* (i4_tmp >> shift) */ /* Second 8 Values */
1883                     src_temp9_8x16b  = _mm_srai_epi16(src_temp9_8x16b,  shift);
1884                     src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b,  shift);
1885                     src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b,  shift);
1886                     src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b,  shift);
1887 
1888                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* 16 8 Values */
1889                     src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
1890                     src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
1891                     src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
1892                     src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
1893 
1894                     /* store four 8-bit output values  */ /* 16 8 Values */
1895                     _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1896                     _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
1897                     _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
1898                     _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1899 
1900                     /* To update pointer */
1901                     pi2_src1 += 16;
1902                     pi2_src2 += 16;
1903                     pu1_dst  += 16;
1904 
1905                 } /* inner loop ends here(8-output values in single iteration) */
1906 
1907                 pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
1908                 pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
1909                 pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
1910 
1911             }
1912         }
1913         else if(0 == (wd & 7)) /* multiple of 8 case */
1914         {
1915             /*  outer for loop starts from here */
1916             for(row = 0; row < ht; row += 4)
1917             {
1918                 for(col = 0; col < wd; col += 8)
1919                 {
1920                     /*load 8 pixel values */
1921                     src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1922                     src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1923                     /* row = 1 */
1924                     src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1925                     src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1926                     /* row = 2 */
1927                     src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1928                     src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1929                     /* row = 3 */
1930                     src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1931                     src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1932 
1933                     /* (pi2_src1[col] + pi2_src2[col]) */
1934                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1935                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1936                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1937                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1938 
1939                     /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1940                     src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1941                     src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1942                     src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1943                     src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1944 
1945                     /* (i4_tmp >> shift) */
1946                     src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
1947                     src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b,  shift);
1948                     src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);
1949                     src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b,  shift);
1950 
1951                     /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1952                     src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1953                     src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1954                     src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1955                     src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1956 
1957                     /* store four 8-bit output values  */
1958                     _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1959                     _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 2*/
1960                     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 1*/
1961                     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1962 
1963                     /* To update pointer */
1964                     pi2_src1 += 8;
1965                     pi2_src2 += 8;
1966                     pu1_dst  += 8;
1967 
1968                 } /* inner loop ends here(8-output values in single iteration) */
1969 
1970                 pi2_src1 = pi2_src1 - wd + 4 * src_strd1;  /* Pointer update */
1971                 pi2_src2 = pi2_src2 - wd + 4 * src_strd2;  /* Pointer update */
1972                 pu1_dst  = pu1_dst - wd + 4 * dst_strd;   /* Pointer update */
1973 
1974             }
1975         }
1976         else /* wd multiple of 4 case*/
1977         {
1978             WORD32 dst0, dst1, dst2, dst3;
1979 
1980             /*  outer for loop starts from here */
1981             for(row = 0; row < ht; row += 4)
1982             {
1983                 for(col = 0; col < wd; col += 4)
1984                 {
1985                     /*load 4 pixel values from 7:0 pos. relative to cur. pos.*/
1986                     src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1987                     /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1988                     src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1989 
                    /* row = 1 */
                    src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                    src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
                    /* row = 2 */
                    src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
                    src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
                    /* row = 3 */
                    src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
                    src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));

                    /* pack rows 0-1 and rows 2-3 together */
                    src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                    src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
                    src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
                    src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);

                    /* (pi2_src1[col] + pi2_src2[col]) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);

                    /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                    src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
                    src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);

                    /* (i4_tmp >> shift) */
                    src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);
                    src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b,  shift);

                    /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                    src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
                    src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);

                    dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                    /* dst rows 1 and 3 */
                    src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
                    src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
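                    /*
                     * _mm_shuffle_epi32(x, 1) replicates dword 1 of x into
                     * the low dword, so the _mm_cvtsi128_si32 calls below can
                     * read the second packed row as a 32-bit scalar.
                     */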

                    /* store four 8-bit output values per row */
                    *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                    dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
                    dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
                    dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);

                    /* rows 1 to 3 */
                    *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
                    *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
                    *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;

                    /* advance pointers to the next 4 columns */
                    pi2_src1 += 4;
                    pi2_src2 += 4;
                    pu1_dst  += 4;

                } /* inner loop ends here (4 output values per iteration) */

                pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* rewind to column 0, advance 4 rows */
                pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* rewind to column 0, advance 4 rows */
                pu1_dst  = pu1_dst  - wd + 4 * dst_strd;  /* rewind to column 0, advance 4 rows */

            }
        }
    }
    else /* ht multiple of 2 case and wd multiple of 4 case */
    {
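        /*
         * Remaining case: ht is only a multiple of 2 (wd a multiple of 4),
         * so each iteration packs two 4-pixel rows into one register and
         * emits two rows of output.
         */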

        WORD32 dst0, dst1;

        /* outer loop starts from here */
        for(row = 0; row < ht; row += 2)
        {
            for(col = 0; col < wd; col += 4)
            {
                /* load four 16-bit values from 3:0 pos. relative to cur. pos. */
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
                /* load four 16-bit values from 3:0 pos. relative to cur. pos. */
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));

                /* row = 1 */
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
                src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));

                /* pack rows 0 and 1 together */
                src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
                src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);

                /* (pi2_src1[col] + pi2_src2[col]) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);

                /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
                src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);

                /* (i4_tmp >> shift) */
                src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b,  shift);

                /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
                src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);

                dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
                /* dst row = 1: bring dword 1 into the low dword */
                src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);

                /* store four 8-bit output values */
                *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;

                dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);

                /* row = 1 */
                *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;

                /* advance pointers to the next 4 columns */
                pi2_src1 += 4;
                pi2_src2 += 4;
                pu1_dst  += 4;

            } /* inner loop ends here (4 output values per iteration) */

            pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* rewind to column 0, advance 2 rows */
            pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* rewind to column 0, advance 2 rows */
            pu1_dst  = pu1_dst  - wd + 2 * dst_strd;  /* rewind to column 0, advance 2 rows */

        }

    }

}
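
/*
 * Illustrative scalar reference for the bi-default path above: a minimal
 * sketch, not part of the library API. It assumes the same operand
 * convention as the SIMD code, i.e. both sources hold 14-bit intermediate
 * predictions, lvl_shift1/lvl_shift2 are added together with the rounding
 * term (1 << (shift - 1)), and the result is clipped to 8 bits. The guard
 * macro name is hypothetical; enable it only to cross-check the vector path.
 */
#ifdef IHEVC_WP_SCALAR_REFERENCE
static void ihevc_weighted_pred_bi_default_scalar(WORD16 *pi2_src1,
                                                  WORD16 *pi2_src2,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd1,
                                                  WORD32 src_strd2,
                                                  WORD32 dst_strd,
                                                  WORD32 lvl_shift1,
                                                  WORD32 lvl_shift2,
                                                  WORD32 ht,
                                                  WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;

    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            WORD32 i4_tmp = pi2_src1[col] + pi2_src2[col]
                            + lvl_shift1 + lvl_shift2
                            + (1 << (shift - 1));
            pu1_dst[col] = CLIP_U8(i4_tmp >> shift);
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif /* IHEVC_WP_SCALAR_REFERENCE */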