1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_weighted_pred_x86_intr.c
22 *
23 * @brief
24 * Contains function definitions for weighted prediction used in inter
25 * prediction
26 *
27 * @author
28 *
29 *
30 * @par List of Functions:
31 * - ihevc_weighted_pred_uni_sse42()
32 * - ihevc_weighted_pred_bi_sse42()
33 * - ihevc_weighted_pred_bi_default_sse42()
34 * - ihevc_weighted_pred_chroma_uni_sse42()
35 * - ihevc_weighted_pred_chroma_bi_sse42()
36 *
37 * @remarks
38 * None
39 *
40 *******************************************************************************
41 */
42 /*****************************************************************************/
43 /* File Includes */
44 /*****************************************************************************/
45 #include <stdio.h>
46 #include <assert.h>
47
48 #include "ihevc_debug.h"
49 #include "ihevc_typedefs.h"
50 #include "ihevc_macros.h"
51 #include "ihevc_platform_macros.h"
52 #include "ihevc_func_selector.h"
53 #include "ihevc_defs.h"
54 #include "ihevc_weighted_pred.h"
55 #include "ihevc_inter_pred.h"
56
57 #include <immintrin.h>
58
59 /**
60 *******************************************************************************
61 *
62 * @brief
63 *  Does uni-weighted prediction on the array pointed by pi2_src and stores
64 *  it at the location pointed by pu1_dst
65 *
66 * @par Description:
67 *  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
68 *  offset
69 *
70 * @param[in] pi2_src
71 * Pointer to the source
72 *
73 * @param[out] pu1_dst
74 * Pointer to the destination
75 *
76 * @param[in] src_strd
77 * Source stride
78 *
79 * @param[in] dst_strd
80 * Destination stride
81 *
82 * @param[in] wgt0
83 * weight to be multiplied to the source
84 *
85 * @param[in] off0
86 *  offset to be added after rounding and shifting
87 *
91 * @param[in] shift
92 *  (14 - bit depth) + log2_weight_denominator
93 *
94 * @param[in] lvl_shift
95 * added before shift and offset
96 *
97 * @param[in] ht
98 * height of the source
99 *
100 * @param[in] wd
101 * width of the source
102 *
103 * @returns
104 *
105 * @remarks
106 * None
107 *
108 *******************************************************************************
109 */
110
111 void ihevc_weighted_pred_uni_sse42(WORD16 *pi2_src,
112 UWORD8 *pu1_dst,
113 WORD32 src_strd,
114 WORD32 dst_strd,
115 WORD32 wgt0,
116 WORD32 off0,
117 WORD32 shift,
118 WORD32 lvl_shift,
119 WORD32 ht,
120 WORD32 wd)
121 {
122 WORD32 row, col, temp;
123 WORD32 dst0, dst1, dst2, dst3;
124
125 /* all 128 bit registers are named with a suffix mxnb, where m is the */
126 /* number of n-bit values packed in the register */
127 __m128i src_temp0_4x32b, src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b;
128 __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;
129
130 ASSERT(wd % 4 == 0); /* checking assumption*/
131 ASSERT(ht % 4 == 0); /* checking assumption*/
132
133 temp = 1 << (shift - 1);
134
135 // setting values in registers
136 const_temp_4x32b = _mm_set1_epi32(temp);
137 lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
138 wgt0_4x32b = _mm_set1_epi32(wgt0);
139 off0_4x32b = _mm_set1_epi32(off0);
140
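/*
 * Reference: a scalar sketch of the per-pixel computation that the SIMD
 * code below vectorizes (same arguments as this function; kept as a
 * comment for illustration only):
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < wd; col++)
 *         {
 *             WORD32 i4_tmp = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0;
 *             i4_tmp += 1 << (shift - 1);
 *             i4_tmp = (i4_tmp >> shift) + off0;
 *             pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp);
 *         }
 */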
141 if(0 == (wd & 7)) /* wd multiple of 8 case */
142 {
143 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
144
145 /* outer for loop starts from here */
146 for(row = 0; row < ht; row += 4)
147 {
148 for(col = 0; col < wd; col += 8)
149 { /* for row =0 ,1,2,3*/
150
151 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
152 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
153 /* row = 1 */
154 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
155 /* row = 2 */
156 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
157 /* row = 3 */
158 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
159
160 /* row = 0 */ /* Last 4 pixels */
161 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
162 /* row = 1 */
163 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
164 /* row = 2 */
165 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd + 4));
166 /* row = 3 */
167 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd + 4));
168
169 /* considering pix. 4:0 by converting 16-into 32 bit */ /* First 4 pixels */
170 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
171 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
172 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
173 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
174
175 /* (pi2_src[col] + lvl_shift)*/ /* First 4 pixels */
176 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
177 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
178 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
179 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
180
181 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* First 4 pixels */
182 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
183 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
184 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
185 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
186
187 /* considering pix. 7:4 by converting 16-bit into 32-bit */ /* Last 4 pixels */
188 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
189 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
190 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
191 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
192
193 /* (pi2_src[col] + lvl_shift)*/ /* Last 4 pixels */
194 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
195 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
196 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
197 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
198
199 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/ /* Last 4 pixels */
200 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
201 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
202 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
203 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
204
205 /* i4_tmp += 1 << (shift - 1) */ /* First 4 pixels */
206 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
207 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
208 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
209 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
210
211 /* (i4_tmp >> shift) */ /* First 4 pixels */
212 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
213 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
214 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
215 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
216
217 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
218 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
219 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
220 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
221 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
222
223 /* (i4_tmp >> shift) */ /* Last 4 pixels */
224 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
225 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
226 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
227 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
228
229 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* First 4 pixels */
230 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
231 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
232 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
233 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
234
235 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
236 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
237 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
238 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
239 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);
240
241 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp4_4x32b);
242 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
243 src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp6_4x32b);
244 src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
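/* _mm_packs_epi32 (32 -> 16 with signed saturation) followed by */
/* _mm_packus_epi16 (16 -> 8 with unsigned saturation) clamps the result */
/* to [0, 255], i.e. it implements CLIP_U8 on all lanes at once */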
245 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
246 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
247 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
248 src_temp2_4x32b = _mm_packus_epi16(src_temp2_4x32b, src_temp2_4x32b);
249 src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
250
251 /* store eight 8-bit output values per row */
252 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
253 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
254 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp2_4x32b); /* row = 2*/
255 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp3_4x32b); /* row = 3*/
256
257 /* To update pointer */
258 pi2_src += 8;
259 pu1_dst += 8;
260
261 } /* inner loop ends here (8 output values per row per iteration) */
262
263 pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
264 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
265
266 }
267 }
268 else /* wd multiple of 4 case */
269 {
270 /* outer for loop starts from here */
271 for(row = 0; row < ht; row += 4)
272 {
273 for(col = 0; col < wd; col += 4)
274 { /* for row =0 ,1,2,3*/
275
276 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
277 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
278 /* row = 1 */
279 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
280 /* row = 2 */
281 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 2 * src_strd));
282 /* row = 3 */
283 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 3 * src_strd));
284
285 /* considering pix. 4:0 by converting 16-into 32 bit */
286 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
287 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
288 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
289 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
290
291 /* (pi2_src[col] + lvl_shift)*/
292 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
293 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
294 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
295 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
296
297 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
298 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
299 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
300 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
301 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
302
303 /* i4_tmp += 1 << (shift - 1) */
304 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
305 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
306 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
307 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
308
309 /* (i4_tmp >> shift) */
310 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
311 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
312 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
313 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
314
315 /*i4_tmp = (i4_tmp >> shift) + off0; */
316 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
317 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
318 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
319 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
320
321 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
322 src_temp2_4x32b = _mm_packs_epi32(src_temp2_4x32b, src_temp3_4x32b);
323
324 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
325 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp2_4x32b);
326
327 dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
328 /* dst row = 1 to 3 */
329 src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
330 src_temp2_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 2);
331 src_temp3_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 3);
332
333 /* store four 8-bit output values */
334 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
335
336 dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
337 dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
338 dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
339
340 /* row = 1 to row = 3 */
341 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
342 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
343 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
344
345 /* To update pointer */
346 pi2_src += 4;
347 pu1_dst += 4;
348
349 } /* inner loop ends here(4-output values in single iteration) */
350
351 pi2_src = pi2_src - wd + 4 * src_strd; /* Pointer update */
352 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
353
354 }
355 }
356 }
357
358 /**
359 *******************************************************************************
360 *
361 * @brief
362 *  Does chroma uni-weighted prediction on the array pointed by pi2_src and
363 *  stores it at the location pointed by pu1_dst
364 *
365 * @par Description:
366 *  dst = ( ( (src + lvl_shift) * wgt0 + (1 << (shift - 1)) ) >> shift ) +
367 *  offset
368 *
369 * @param[in] pi2_src
370 * Pointer to the source
371 *
372 * @param[out] pu1_dst
373 * Pointer to the destination
374 *
375 * @param[in] src_strd
376 * Source stride
377 *
378 * @param[in] dst_strd
379 * Destination stride
380 *
381 * @param[in] wgt0
382 * weight to be multiplied to the source
383 *
384 * @param[in] off0
385 *  offset to be added after rounding and shifting
386 *
390 * @param[in] shift
391 *  (14 - bit depth) + log2_weight_denominator
392 *
393 * @param[in] lvl_shift
394 * added before shift and offset
395 *
396 * @param[in] ht
397 * height of the source
398 *
399 * @param[in] wd
400 * width of the source (each colour component)
401 *
402 * @returns
403 *
404 * @remarks
405 * None
406 *
407 *******************************************************************************
408 */
409
410 void ihevc_weighted_pred_chroma_uni_sse42(WORD16 *pi2_src,
411 UWORD8 *pu1_dst,
412 WORD32 src_strd,
413 WORD32 dst_strd,
414 WORD32 wgt0_cb,
415 WORD32 wgt0_cr,
416 WORD32 off0_cb,
417 WORD32 off0_cr,
418 WORD32 shift,
419 WORD32 lvl_shift,
420 WORD32 ht,
421 WORD32 wd)
422 {
423 WORD32 row, col, temp, wdx2;
424 /* all 128 bit registers are named with a suffix mxnb, where m is the */
425 /* number of n-bit values packed in the register */
426
427 __m128i src_temp0_4x32b, src_temp1_4x32b;
428 __m128i const_temp_4x32b, lvl_shift_4x32b, wgt0_4x32b, off0_4x32b;
429
430 ASSERT(wd % 2 == 0); /* checking assumption*/
431 ASSERT(ht % 2 == 0); /* checking assumption*/
432
433 temp = 1 << (shift - 1);
434 wdx2 = 2 * wd;
435
436 // setting values in registers
437 const_temp_4x32b = _mm_set1_epi32(temp);
438 lvl_shift_4x32b = _mm_set1_epi32(lvl_shift);
439 wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
440 off0_4x32b = _mm_set_epi32(off0_cr, off0_cb, off0_cr, off0_cb);
441
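/*
 * Chroma samples are interleaved (Cb, Cr, Cb, Cr, ...), so wgt0_4x32b and
 * off0_4x32b above hold alternating Cb/Cr values and a single multiply/add
 * covers both planes. A scalar sketch of the per-sample computation (kept
 * as a comment for illustration only):
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < 2 * wd; col += 2)
 *         {
 *             WORD32 i4_tmp_cb = (pi2_src[row * src_strd + col] + lvl_shift) * wgt0_cb;
 *             WORD32 i4_tmp_cr = (pi2_src[row * src_strd + col + 1] + lvl_shift) * wgt0_cr;
 *             i4_tmp_cb = ((i4_tmp_cb + (1 << (shift - 1))) >> shift) + off0_cb;
 *             i4_tmp_cr = ((i4_tmp_cr + (1 << (shift - 1))) >> shift) + off0_cr;
 *             pu1_dst[row * dst_strd + col]     = CLIP_U8(i4_tmp_cb);
 *             pu1_dst[row * dst_strd + col + 1] = CLIP_U8(i4_tmp_cr);
 *         }
 */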
442 #if 0 /* Enable this for the ht % 4 == 0 case; it degraded performance for smaller sizes and improved it for larger ones. */
443 if( 0 == (ht & 3)) /* ht multiple of 4 case */
444 {
445 if( 0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
446 {
447 __m128i src_temp2_4x32b, src_temp3_4x32b;
448 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
449 __m128i src_temp8_4x32b, src_temp9_4x32b, src_temp10_4x32b, src_temp11_4x32b;
450 __m128i src_temp12_4x32b, src_temp13_4x32b, src_temp14_4x32b, src_temp15_4x32b;
451 /* outer for loop starts from here */
452 for(row = 0; row < ht; row +=4)
453 {
454 for(col = 0; col < wdx2; col +=16)
455 {
456 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
457 src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
458 /* row = 1 */
459 src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
460 /* row = 0 */ /* Second 4 pixels */
461 src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
462 /* row = 1 */
463 src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
464 /* row = 0 */ /* Third 4 pixels */
465 src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+8));
466 /* row = 1 */
467 src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+8));
468 /* row = 0 */ /* Last 4 pixels */
469 src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+12));
470 /* row = 1 */
471 src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+12));
472
473 /* considering pix. 4:0 by converting 16-into 32 bit */
474 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
475 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
476 /* (pi2_src[col] + lvl_shift)*/
477 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
478 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
479 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
480 src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
481 src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
482
483 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
484 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
485 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
486 /* (pi2_src[col] + lvl_shift)*/
487 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
488 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
489 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
490 src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
491 src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
492
493 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
494 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
495 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
496 /* (pi2_src[col] + lvl_shift)*/
497 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
498 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
499 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
500 src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
501 src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);
502
503 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
504 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
505 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
506 /* (pi2_src[col] + lvl_shift)*/
507 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
508 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
509 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
510 src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
511 src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);
512
513 /* i4_tmp += 1 << (shift - 1) */
514 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
515 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
516 /* (i4_tmp >> shift) */
517 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
518 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
519
520 /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
521 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
522 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
523 /* (i4_tmp >> shift) */
524 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
525 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
526
527 /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
528 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
529 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
530 /* (i4_tmp >> shift) */
531 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
532 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
533
534 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
535 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
536 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
537 /* (i4_tmp >> shift) */
538 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
539 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
540
541 /*i4_tmp = (i4_tmp >> shift) + off0; */
542 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
543 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
544 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
545 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
546 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
547 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
548 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
549 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
550 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
551 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
552 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);
553
554 src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp2_4x32b);
555 src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp3_4x32b);
556 src_temp4_4x32b = _mm_packs_epi32 (src_temp4_4x32b, src_temp6_4x32b);
557 src_temp5_4x32b = _mm_packs_epi32 (src_temp5_4x32b, src_temp7_4x32b);
558 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
559 src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp4_4x32b);
560 src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp5_4x32b);
561
562 /* store 16 8-bit output values */
563 _mm_storeu_si128((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
564 _mm_storeu_si128((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
565
566 /* row = 2 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
567 src_temp8_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
568 /* row = 3 */
569 src_temp9_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
570 /* row = 2 */ /* Second 4 pixels */
571 src_temp10_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
572 /* row = 3 */
573 src_temp11_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
574 /* row = 2 */ /* Third 4 pixels */
575 src_temp12_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+8));
576 /* row = 3 */
577 src_temp13_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+8));
578 /* row = 2 */ /* Last 4 pixels */
579 src_temp14_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+12));
580 /* row = 3 */
581 src_temp15_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+12));
582
583 /* considering pix. 4:0 by converting 16-into 32 bit */
584 src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
585 src_temp9_4x32b = _mm_cvtepi16_epi32(src_temp9_4x32b);
586 /* (pi2_src[col] + lvl_shift)*/
587 src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, lvl_shift_4x32b);
588 src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, lvl_shift_4x32b);
589 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
590 src_temp8_4x32b = _mm_mullo_epi32 (src_temp8_4x32b, wgt0_4x32b);
591 src_temp9_4x32b = _mm_mullo_epi32 (src_temp9_4x32b, wgt0_4x32b);
592
593 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Second 4 pixels */
594 src_temp10_4x32b = _mm_cvtepi16_epi32(src_temp10_4x32b);
595 src_temp11_4x32b = _mm_cvtepi16_epi32(src_temp11_4x32b);
596 /* (pi2_src[col] + lvl_shift)*/
597 src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, lvl_shift_4x32b);
598 src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, lvl_shift_4x32b);
599 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
600 src_temp10_4x32b = _mm_mullo_epi32 (src_temp10_4x32b, wgt0_4x32b);
601 src_temp11_4x32b = _mm_mullo_epi32 (src_temp11_4x32b, wgt0_4x32b);
602
603 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Third 4 pixels */
604 src_temp12_4x32b = _mm_cvtepi16_epi32(src_temp12_4x32b);
605 src_temp13_4x32b = _mm_cvtepi16_epi32(src_temp13_4x32b);
606 /* (pi2_src[col] + lvl_shift)*/
607 src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, lvl_shift_4x32b);
608 src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, lvl_shift_4x32b);
609 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
610 src_temp12_4x32b = _mm_mullo_epi32 (src_temp12_4x32b, wgt0_4x32b);
611 src_temp13_4x32b = _mm_mullo_epi32 (src_temp13_4x32b, wgt0_4x32b);
612
613 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
614 src_temp14_4x32b = _mm_cvtepi16_epi32(src_temp14_4x32b);
615 src_temp15_4x32b = _mm_cvtepi16_epi32(src_temp15_4x32b);
616 /* (pi2_src[col] + lvl_shift)*/
617 src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, lvl_shift_4x32b);
618 src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, lvl_shift_4x32b);
619 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
620 src_temp14_4x32b = _mm_mullo_epi32 (src_temp14_4x32b, wgt0_4x32b);
621 src_temp15_4x32b = _mm_mullo_epi32 (src_temp15_4x32b, wgt0_4x32b);
622
623 /* i4_tmp += 1 << (shift - 1) */
624 src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, const_temp_4x32b);
625 src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, const_temp_4x32b);
626 /* (i4_tmp >> shift) */
627 src_temp8_4x32b = _mm_srai_epi32(src_temp8_4x32b, shift);
628 src_temp9_4x32b = _mm_srai_epi32(src_temp9_4x32b, shift);
629
630 /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
631 src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, const_temp_4x32b);
632 src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, const_temp_4x32b);
633 /* (i4_tmp >> shift) */
634 src_temp10_4x32b = _mm_srai_epi32(src_temp10_4x32b, shift);
635 src_temp11_4x32b = _mm_srai_epi32(src_temp11_4x32b, shift);
636
637 /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
638 src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, const_temp_4x32b);
639 src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, const_temp_4x32b);
640 /* (i4_tmp >> shift) */
641 src_temp12_4x32b = _mm_srai_epi32(src_temp12_4x32b, shift);
642 src_temp13_4x32b = _mm_srai_epi32(src_temp13_4x32b, shift);
643
644 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
645 src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, const_temp_4x32b);
646 src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, const_temp_4x32b);
647 /* (i4_tmp >> shift) */
648 src_temp14_4x32b = _mm_srai_epi32(src_temp14_4x32b, shift);
649 src_temp15_4x32b = _mm_srai_epi32(src_temp15_4x32b, shift);
650
651 /*i4_tmp = (i4_tmp >> shift) + off0; */
652 src_temp8_4x32b = _mm_add_epi32 (src_temp8_4x32b, off0_4x32b);
653 src_temp9_4x32b = _mm_add_epi32 (src_temp9_4x32b, off0_4x32b);
654 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
655 src_temp10_4x32b = _mm_add_epi32 (src_temp10_4x32b, off0_4x32b);
656 src_temp11_4x32b = _mm_add_epi32 (src_temp11_4x32b, off0_4x32b);
657 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
658 src_temp12_4x32b = _mm_add_epi32 (src_temp12_4x32b, off0_4x32b);
659 src_temp13_4x32b = _mm_add_epi32 (src_temp13_4x32b, off0_4x32b);
660 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
661 src_temp14_4x32b = _mm_add_epi32 (src_temp14_4x32b, off0_4x32b);
662 src_temp15_4x32b = _mm_add_epi32 (src_temp15_4x32b, off0_4x32b);
663
664 src_temp8_4x32b = _mm_packs_epi32 (src_temp8_4x32b, src_temp10_4x32b);
665 src_temp9_4x32b = _mm_packs_epi32 (src_temp9_4x32b, src_temp11_4x32b);
666 src_temp12_4x32b = _mm_packs_epi32 (src_temp12_4x32b, src_temp14_4x32b);
667 src_temp13_4x32b = _mm_packs_epi32 (src_temp13_4x32b, src_temp15_4x32b);
668 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
669 src_temp8_4x32b = _mm_packus_epi16 (src_temp8_4x32b, src_temp12_4x32b);
670 src_temp9_4x32b = _mm_packus_epi16 (src_temp9_4x32b, src_temp13_4x32b);
671
672 /* store 16 8-bit output values */
673 _mm_storeu_si128((__m128i*)(pu1_dst+2*dst_strd), src_temp8_4x32b); /* row = 2*/
674 _mm_storeu_si128((__m128i*)(pu1_dst+3*dst_strd), src_temp9_4x32b); /* row = 3*/
675
676 pi2_src += 16; /* Pointer update */
677 pu1_dst += 16; /* Pointer update */
678
679 } /* inner loop ends here(4-output values in single iteration) */
680 pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */
681 pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
682 }
683 }
684 else if( 0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
685 {
686 __m128i src_temp2_4x32b,src_temp3_4x32b;
687 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
688 /* outer for loop starts from here */
689 for(row = 0; row < ht; row +=4)
690 {
691 for(col = 0; col < wdx2; col +=8)
692 {
693 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
694 src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
695 /* row = 1 */
696 src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd));
697 /* row = 2 */
698 src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
699 /* row = 3 */
700 src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
701
702 /* row = 0 */ /* Last 4 pixels */
703 src_temp4_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+4));
704 /* row = 1 */
705 src_temp5_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+src_strd+4));
706 /* row = 2 */
707 src_temp6_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd+4));
708 /* row = 3 */
709 src_temp7_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd+4));
710
711 /* considering pix. 4:0 by converting 16-into 32 bit */
712 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
713 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
714 /* (pi2_src[col] + lvl_shift)*/
715 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
716 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
717 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
718 src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
719 src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
720
721 /* considering pix. 4:0 by converting 16-into 32 bit */ /* Last 4 pixels */
722 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
723 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
724 /* (pi2_src[col] + lvl_shift)*/
725 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
726 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
727 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
728 src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
729 src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
730
731 /* considering pix. 4:0 by converting 16-into 32 bit */
732 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
733 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
734 /* (pi2_src[col] + lvl_shift)*/
735 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, lvl_shift_4x32b);
736 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, lvl_shift_4x32b);
737 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
738 src_temp4_4x32b = _mm_mullo_epi32 (src_temp4_4x32b, wgt0_4x32b);
739 src_temp5_4x32b = _mm_mullo_epi32 (src_temp5_4x32b, wgt0_4x32b);
740
741 /* considering pix. 4:0 by converting 16-into 32 bit */
742 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
743 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
744 /* (pi2_src[col] + lvl_shift)*/
745 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, lvl_shift_4x32b);
746 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, lvl_shift_4x32b);
747 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
748 src_temp6_4x32b = _mm_mullo_epi32 (src_temp6_4x32b, wgt0_4x32b);
749 src_temp7_4x32b = _mm_mullo_epi32 (src_temp7_4x32b, wgt0_4x32b);
750
751 /* i4_tmp += 1 << (shift - 1) */
752 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
753 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
754 /* (i4_tmp >> shift) */
755 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
756 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
757
758 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
759 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
760 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
761 /* (i4_tmp >> shift) */
762 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
763 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
764
765 /* i4_tmp += 1 << (shift - 1) */
766 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, const_temp_4x32b);
767 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, const_temp_4x32b);
768 /* (i4_tmp >> shift) */
769 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
770 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
771
772 /* i4_tmp += 1 << (shift - 1) */
773 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, const_temp_4x32b);
774 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, const_temp_4x32b);
775 /* (i4_tmp >> shift) */
776 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
777 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
778
779 /*i4_tmp = (i4_tmp >> shift) + off0; */
780 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
781 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
782 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
783 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
784 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
785 /*i4_tmp = (i4_tmp >> shift) + off0; */
786 src_temp4_4x32b = _mm_add_epi32 (src_temp4_4x32b, off0_4x32b);
787 src_temp5_4x32b = _mm_add_epi32 (src_temp5_4x32b, off0_4x32b);
788 /*i4_tmp = (i4_tmp >> shift) + off0; */
789 src_temp6_4x32b = _mm_add_epi32 (src_temp6_4x32b, off0_4x32b);
790 src_temp7_4x32b = _mm_add_epi32 (src_temp7_4x32b, off0_4x32b);
791
792 src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp4_4x32b);
793 src_temp1_4x32b = _mm_packs_epi32 (src_temp1_4x32b, src_temp5_4x32b);
794 src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp6_4x32b);
795 src_temp3_4x32b = _mm_packs_epi32 (src_temp3_4x32b, src_temp7_4x32b);
796
797 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
798 src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp0_4x32b);
799 src_temp1_4x32b = _mm_packus_epi16 (src_temp1_4x32b, src_temp1_4x32b);
800 src_temp2_4x32b = _mm_packus_epi16 (src_temp2_4x32b, src_temp2_4x32b);
801 src_temp3_4x32b = _mm_packus_epi16 (src_temp3_4x32b, src_temp3_4x32b);
802
803 /* store eight 8-bit output values per row */
804 _mm_storel_epi64((__m128i*)(pu1_dst+0*dst_strd), src_temp0_4x32b); /* row = 0*/
805 _mm_storel_epi64((__m128i*)(pu1_dst+1*dst_strd), src_temp1_4x32b); /* row = 1*/
806 _mm_storel_epi64((__m128i*)(pu1_dst+2*dst_strd), src_temp2_4x32b); /* row = 2*/
807 _mm_storel_epi64((__m128i*)(pu1_dst+3*dst_strd), src_temp3_4x32b); /* row = 3*/
808
809 pi2_src += 8; /* Pointer update */
810 pu1_dst += 8; /* Pointer update */
811
812 } /* inner loop ends here(4-output values in single iteration) */
813 pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */
814 pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
815 }
816 }
817 else /* 2*wd multiple of 4 case */
818 {
819 WORD32 dst0, dst1, dst2, dst3;
820 __m128i src_temp2_4x32b,src_temp3_4x32b;
821 /* outer for loop starts from here */
822 for(row = 0; row < ht; row +=4)
823 {
824 for(col = 0; col < wdx2; col +=4)
825 {
826 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
827 src_temp0_4x32b = _mm_loadu_si128((__m128i*)(pi2_src));
828 /* row = 1 */
829 src_temp1_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+1*src_strd));
830 /* row = 2 */
831 src_temp2_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+2*src_strd));
832 /* row = 3 */
833 src_temp3_4x32b = _mm_loadu_si128((__m128i*)(pi2_src+3*src_strd));
834
835 /* considering pix. 4:0 by converting 16-into 32 bit */
836 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
837 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
838 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
839 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
840
841 /* (pi2_src[col] + lvl_shift)*/
842 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, lvl_shift_4x32b);
843 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, lvl_shift_4x32b);
844 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
845 src_temp0_4x32b = _mm_mullo_epi32 (src_temp0_4x32b, wgt0_4x32b);
846 src_temp1_4x32b = _mm_mullo_epi32 (src_temp1_4x32b, wgt0_4x32b);
847
848 /* (pi2_src[col] + lvl_shift)*/
849 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, lvl_shift_4x32b);
850 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, lvl_shift_4x32b);
851 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
852 src_temp2_4x32b = _mm_mullo_epi32 (src_temp2_4x32b, wgt0_4x32b);
853 src_temp3_4x32b = _mm_mullo_epi32 (src_temp3_4x32b, wgt0_4x32b);
854
855 /* i4_tmp += 1 << (shift - 1) */
856 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, const_temp_4x32b);
857 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, const_temp_4x32b);
858 /* (i4_tmp >> shift) */
859 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
860 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
861 /*i4_tmp = (i4_tmp >> shift) + off0; */
862 src_temp0_4x32b = _mm_add_epi32 (src_temp0_4x32b, off0_4x32b);
863 src_temp1_4x32b = _mm_add_epi32 (src_temp1_4x32b, off0_4x32b);
864
865 /* i4_tmp += 1 << (shift - 1) */
866 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, const_temp_4x32b);
867 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, const_temp_4x32b);
868 /* (i4_tmp >> shift) */
869 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
870 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
871 /*i4_tmp = (i4_tmp >> shift) + off0; */
872 src_temp2_4x32b = _mm_add_epi32 (src_temp2_4x32b, off0_4x32b);
873 src_temp3_4x32b = _mm_add_epi32 (src_temp3_4x32b, off0_4x32b);
874
875 src_temp0_4x32b = _mm_packs_epi32 (src_temp0_4x32b, src_temp1_4x32b);
876 src_temp2_4x32b = _mm_packs_epi32 (src_temp2_4x32b, src_temp3_4x32b);
877
878 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
879 src_temp0_4x32b = _mm_packus_epi16 (src_temp0_4x32b, src_temp2_4x32b);
880
881 dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
882 /* dst row = 1 to 3 */
883 src_temp1_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 1);
884 src_temp2_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 2);
885 src_temp3_4x32b = _mm_shuffle_epi32 (src_temp0_4x32b, 3);
886
887 /* store four 8-bit output values */
888 *(WORD32 *) (&pu1_dst[0*dst_strd]) = dst0;
889
890 dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
891 dst2 = _mm_cvtsi128_si32(src_temp2_4x32b);
892 dst3 = _mm_cvtsi128_si32(src_temp3_4x32b);
893 /* row = 1 */
894 *(WORD32 *) (&pu1_dst[1*dst_strd]) = dst1;
895 /* row = 2 */
896 *(WORD32 *) (&pu1_dst[2*dst_strd]) = dst2;
897 /* row = 3 */
898 *(WORD32 *) (&pu1_dst[3*dst_strd]) = dst3;
899
900 pi2_src += 4; /* Pointer update */
901 pu1_dst += 4; /* Pointer update */
902
903 } /* inner loop ends here(4-output values in single iteration) */
904 pi2_src = pi2_src - wdx2 + 4*src_strd; /* Pointer update */
905 pu1_dst = pu1_dst - wdx2 + 4*dst_strd; /* Pointer update */
906 }
907 }
908 }
909 else /* ht multiple of 2 case */
910 #endif
911
912 {
913 if(0 == (wdx2 & 15)) /* 2*wd multiple of 16 case */
914 {
915 __m128i src_temp2_4x32b, src_temp3_4x32b;
916 __m128i src_temp4_4x32b, src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b;
917 /* outer for loop starts from here */
918 for(row = 0; row < ht; row += 2)
919 {
920 for(col = 0; col < wdx2; col += 16)
921 {
922 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
923 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
924 /* row = 1 */
925 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
926
927 /* row = 0 */ /* Second 4 pixels */
928 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
929 /* row = 1 */
930 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
931 /* row = 0 */ /* Third 4 pixels */
932 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 8));
933 /* row = 1 */
934 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 8));
935 /* row = 0 */ /* Last 4 pixels */
936 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 12));
937 /* row = 1 */
938 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 12));
939
940 /* considering pix. 4:0 by converting 16-into 32 bit */
941 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
942 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
943 /* (pi2_src[col] + lvl_shift)*/
944 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
945 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
946 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
947 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
948 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
949
950 /* considering pix. 7:4 by converting 16-bit into 32-bit */ /* Second 4 pixels */
951 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
952 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
953 /* (pi2_src[col] + lvl_shift)*/
954 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
955 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
956 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
957 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
958 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
959
960 /* considering pix. 11:8 by converting 16-bit into 32-bit */ /* Third 4 pixels */
961 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
962 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
963 /* (pi2_src[col] + lvl_shift)*/
964 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift_4x32b);
965 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift_4x32b);
966 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
967 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt0_4x32b);
968 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
969
970 /* considering pix. 15:12 by converting 16-bit into 32-bit */ /* Last 4 pixels */
971 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
972 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
973 /* (pi2_src[col] + lvl_shift)*/
974 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift_4x32b);
975 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift_4x32b);
976 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
977 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt0_4x32b);
978 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
979
980 /* i4_tmp += 1 << (shift - 1) */
981 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
982 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
983 /* (i4_tmp >> shift) */
984 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
985 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
986
987 /* i4_tmp += 1 << (shift - 1) */ /* Second 4 pixels */
988 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
989 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
990 /* (i4_tmp >> shift) */
991 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
992 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
993
994 /* i4_tmp += 1 << (shift - 1) */ /* Third 4 pixels */
995 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, const_temp_4x32b);
996 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
997 /* (i4_tmp >> shift) */
998 src_temp4_4x32b = _mm_srai_epi32(src_temp4_4x32b, shift);
999 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
1000
1001 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
1002 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, const_temp_4x32b);
1003 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
1004 /* (i4_tmp >> shift) */
1005 src_temp6_4x32b = _mm_srai_epi32(src_temp6_4x32b, shift);
1006 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
1007
1008 /*i4_tmp = (i4_tmp >> shift) + off0; */
1009 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
1010 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
1011 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Second 4 pixels */
1012 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
1013 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
1014 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Third 4 pixels */
1015 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, off0_4x32b);
1016 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, off0_4x32b);
1017 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
1018 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, off0_4x32b);
1019 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, off0_4x32b);
1020
1021 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
1022 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
1023 src_temp4_4x32b = _mm_packs_epi32(src_temp4_4x32b, src_temp6_4x32b);
1024 src_temp5_4x32b = _mm_packs_epi32(src_temp5_4x32b, src_temp7_4x32b);
1025 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
1026 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp4_4x32b);
1027 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp5_4x32b);
1028
1029 /* store 16 8-bit output values */
1030 _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
1031 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
1032
1033 pi2_src += 16; /* Pointer update */
1034 pu1_dst += 16; /* Pointer update */
1035
1036 } /* inner loop ends here (16 output values per row per iteration) */
1037 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
1038 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
1039 }
1040 }
1041 else if(0 == (wdx2 & 7)) /* 2*wd multiple of 8 case */
1042 {
1043 __m128i src_temp2_4x32b, src_temp3_4x32b;
1044 /* outer for loop starts from here */
1045 for(row = 0; row < ht; row += 2)
1046 {
1047 for(col = 0; col < wdx2; col += 8)
1048 {
1049 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1050 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
1051 /* row = 1 */
1052 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
1053
1054 /* row = 0 */ /* Last 4 pixels */
1055 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + 4));
1056 /* row = 1 */
1057 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd + 4));
1058
1059 /* considering pix. 4:0 by converting 16-into 32 bit */
1060 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
1061 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
1062 /* (pi2_src[col] + lvl_shift)*/
1063 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
1064 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
1065 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
1066 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
1067 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
1068
1069 /* considering pix. 7:4 by converting 16-bit into 32-bit */ /* Last 4 pixels */
1070 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
1071 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
1072 /* (pi2_src[col] + lvl_shift)*/
1073 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift_4x32b);
1074 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift_4x32b);
1075 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
1076 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt0_4x32b);
1077 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
1078
1079 /* i4_tmp += 1 << (shift - 1) */
1080 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
1081 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
1082 /* (i4_tmp >> shift) */
1083 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
1084 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
1085
1086 /* i4_tmp += 1 << (shift - 1) */ /* Last 4 pixels */
1087 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, const_temp_4x32b);
1088 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
1089 /* (i4_tmp >> shift) */
1090 src_temp2_4x32b = _mm_srai_epi32(src_temp2_4x32b, shift);
1091 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
1092
1093 /*i4_tmp = (i4_tmp >> shift) + off0; */
1094 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
1095 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
1096 /*i4_tmp = (i4_tmp >> shift) + off0; */ /* Last 4 pixels */
1097 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, off0_4x32b);
1098 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, off0_4x32b);
1099
1100 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp2_4x32b);
1101 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
1102
1103 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
1104 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
1105 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
1106
1107 /* store eight 8-bit output values per row */
1108 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp0_4x32b); /* row = 0*/
1109 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp1_4x32b); /* row = 1*/
1110
1111 pi2_src += 8; /* Pointer update */
1112 pu1_dst += 8; /* Pointer update */
1113
1114 } /* inner loop ends here (8 output values per row per iteration) */
1115 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
1116 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
1117 }
1118 }
1119 else /* 2*wd multiple of 4 case */
1120 {
1121 WORD32 dst0, dst1;
1122 /* outer for loop starts from here */
1123 for(row = 0; row < ht; row += 2)
1124 {
1125 for(col = 0; col < wdx2; col += 4)
1126 {
1127 /* row = 0 */ /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1128 src_temp0_4x32b = _mm_loadu_si128((__m128i *)(pi2_src));
1129 /* row = 1 */
1130 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src + src_strd));
1131
1132 /* considering pix. 4:0 by converting 16-into 32 bit */
1133 src_temp0_4x32b = _mm_cvtepi16_epi32(src_temp0_4x32b);
1134 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
1135
1136 /* (pi2_src[col] + lvl_shift)*/
1137 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, lvl_shift_4x32b);
1138 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift_4x32b);
1139
1140 /*i4_tmp = (pi2_src[col] + lvl_shift) * wgt0*/
1141 src_temp0_4x32b = _mm_mullo_epi32(src_temp0_4x32b, wgt0_4x32b);
1142 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
1143
1144 /* i4_tmp += 1 << (shift - 1) */
1145 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, const_temp_4x32b);
1146 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
1147
1148 /* (i4_tmp >> shift) */
1149 src_temp0_4x32b = _mm_srai_epi32(src_temp0_4x32b, shift);
1150 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
1151
1152 /*i4_tmp = (i4_tmp >> shift) + off0; */
1153 src_temp0_4x32b = _mm_add_epi32(src_temp0_4x32b, off0_4x32b);
1154 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, off0_4x32b);
1155
1156 src_temp0_4x32b = _mm_packs_epi32(src_temp0_4x32b, src_temp1_4x32b);
1157
1158 /* pu1_dst[col] = CLIP_U8(i4_tmp); */
1159 src_temp0_4x32b = _mm_packus_epi16(src_temp0_4x32b, src_temp0_4x32b);
1160
1161 dst0 = _mm_cvtsi128_si32(src_temp0_4x32b);
1162 /* dst row = 1 */
1163 src_temp1_4x32b = _mm_shuffle_epi32(src_temp0_4x32b, 1);
1164
1165 /* store four 8-bit output values */
1166 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1167
1168 dst1 = _mm_cvtsi128_si32(src_temp1_4x32b);
1169 /* row = 1 */
1170 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1171
1172 pi2_src += 4; /* Pointer update */
1173 pu1_dst += 4; /* Pointer update */
1174
1175 } /* inner loop ends here(4-output values in single iteration) */
1176 pi2_src = pi2_src - wdx2 + 2 * src_strd; /* Pointer update */
1177 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
1178 }
1179 }
1180 }
1181 }
1182
1183 /**
1184 *******************************************************************************
1185 *
1186 * @brief
1187 *  Does bi-weighted prediction on the arrays pointed by pi2_src1 and
1188 *  pi2_src2 and stores the result at the location pointed by pu1_dst
1189 *
1190 * @par Description:
1191 *  dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 +
1192 *  ( (off0 + off1 + 1) << (shift - 1) ) ) >> shift
1193 *
1194 * @param[in] pi2_src1
1195 * Pointer to source 1
1196 *
1197 * @param[in] pi2_src2
1198 * Pointer to source 2
1199 *
1200 * @param[out] pu1_dst
1201 * Pointer to destination
1202 *
1203 * @param[in] src_strd1
1204 * Source stride 1
1205 *
1206 * @param[in] src_strd2
1207 * Source stride 2
1208 *
1209 * @param[in] dst_strd
1210 * Destination stride
1211 *
1212 * @param[in] wgt0
1213 * weight to be multiplied to source 1
1214 *
1215 * @param[in] off0
1216 * offset 0
1217 *
1218 * @param[in] wgt1
1219 * weight to be multiplied to source 2
1220 *
1221 * @param[in] off1
1222 * offset 1
1223 *
1224 * @param[in] shift
1225 *  (14 - bit depth) + log2_weight_denominator
1226 *
1227 * @param[in] lvl_shift1
1228 * added before shift and offset
1229 *
1230 * @param[in] lvl_shift2
1231 * added before shift and offset
1232 *
1233 * @param[in] ht
1234 * height of the source
1235 *
1236 * @param[in] wd
1237 * width of the source
1238 *
1239 * @returns
1240 *
1241 * @remarks
1242 * None
1243 *
1244 *******************************************************************************
1245 */
1246
1247 void ihevc_weighted_pred_bi_sse42(WORD16 *pi2_src1,
1248 WORD16 *pi2_src2,
1249 UWORD8 *pu1_dst,
1250 WORD32 src_strd1,
1251 WORD32 src_strd2,
1252 WORD32 dst_strd,
1253 WORD32 wgt0,
1254 WORD32 off0,
1255 WORD32 wgt1,
1256 WORD32 off1,
1257 WORD32 shift,
1258 WORD32 lvl_shift1,
1259 WORD32 lvl_shift2,
1260 WORD32 ht,
1261 WORD32 wd)
1262 {
1263 WORD32 row, col, temp;
1264
1265 __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
1266 __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
1267
1268
1269 ASSERT(wd % 4 == 0); /* checking assumption*/
1270 ASSERT(ht % 2 == 0); /* checking assumption*/
1271
1272 temp = (off0 + off1 + 1) << (shift - 1);
1273
1274 // setting values in registers
1275 const_temp_4x32b = _mm_set1_epi32(temp);
1276 lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
1277 lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
1278 wgt0_4x32b = _mm_set1_epi32(wgt0);
1279 wgt1_4x32b = _mm_set1_epi32(wgt1);
1280
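/*
 * Reference: a scalar sketch of the per-pixel bi-weighted computation that
 * the SIMD code below vectorizes; (off0 + off1 + 1) << (shift - 1) is
 * pre-computed into const_temp_4x32b above (kept as a comment for
 * illustration only):
 *
 *     for(row = 0; row < ht; row++)
 *         for(col = 0; col < wd; col++)
 *         {
 *             WORD32 i4_tmp;
 *             i4_tmp  = (pi2_src1[row * src_strd1 + col] + lvl_shift1) * wgt0;
 *             i4_tmp += (pi2_src2[row * src_strd2 + col] + lvl_shift2) * wgt1;
 *             i4_tmp += (off0 + off1 + 1) << (shift - 1);
 *             pu1_dst[row * dst_strd + col] = CLIP_U8(i4_tmp >> shift);
 *         }
 */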
1281 if(0 == (wd & 7)) /* wd multiple of 8 case */
1282 {
1283 __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
1284 /* outer for loop starts from here */
1285 for(row = 0; row < ht; row += 2)
1286 {
1287 for(col = 0; col < wd; col += 8)
1288 {
1289 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1290 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
1291 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
1292 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1293 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1294 /* Next 4 pixels */
1295 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
1296 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
1297 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
1298 src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
1299
1300 /* considering pix. 3:0 by converting 16-bit into 32-bit */
1301 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
1302 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
1303 /* (pi2_src1[col] + lvl_shift1) */
1304 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
1305 /* (pi2_src2[col] + lvl_shift2) */
1306 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
1307 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
1308 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
1309 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
1310 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
1311
1312 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
1313 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
1314 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
1315 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
1316 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
1317 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
1318
1319 /* Next 4 Pixels */
1320 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
1321 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
1322 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
1323 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
1324 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
1325 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
1326 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
1327 src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
1328 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
1329 src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
1330 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
1331 src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
1332
1333 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1334 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
1335 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
1336 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1337 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
1338 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
1339 /* (i4_tmp >> shift) */
1340 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
1341 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
1342
1343 /* Next 4 Pixels */
1344 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
1345 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
1346 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
1347 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
1348 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
1349 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
1350
1351 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
1352 src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
1353
1354 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1355 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
1356 src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
1357
1358 /* store eight 8-bit output values per row */
1359 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
1360 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
1361
1362 pi2_src1 += 8; /* Pointer update */
1363 pi2_src2 += 8; /* Pointer update */
1364 pu1_dst += 8; /* Pointer update */
1365
1366 } /* inner loop ends here(8 output values per row in a single iteration) */
1367
1368 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
1369 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
1370 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
1371
1372 } /* outer loop ends */
1373 }
1374 else /* wd multiple of 4 case */
1375 {
1376 WORD32 dst0, dst1;
1377 /* outer for loop starts from here */
1378 for(row = 0; row < ht; row += 2)
1379 {
1380 for(col = 0; col < wd; col += 4)
1381 {
1382 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1383 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
1384 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
1385 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1386 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1387
1388 /* considering pix. 3:0 by converting 16-bit into 32-bit */
1389 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
1390 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
1391 /* (pi2_src1[col] + lvl_shift1) */
1392 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
1393 /* (pi2_src2[col] + lvl_shift2) */
1394 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
1395 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
1396 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
1397 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
1398 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
1399
1400 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
1401 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
1402 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
1403 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
1404 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
1405 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
1406
1407 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1408 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
1409 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
1410
1411 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1412 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
1413 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
1414
1415 /* (i4_tmp >> shift) */
1416 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
1417 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
1418
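/* pack rows 0 and 1 to 16 bit in a single register: words 3:0 hold row 0, words 7:4 hold row 1 */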
1419 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
1420
1421 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1422 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
1423
1424 dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
1425
1426 /* dst row = 1 */
1427 src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
1428
1429 /* store four 8-bit output values */
1430 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1431
1432 dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
1433
1434 /* row = 1 */
1435 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1436
1437 pi2_src1 += 4; /* Pointer update */
1438 pi2_src2 += 4; /* Pointer update */
1439 pu1_dst += 4; /* Pointer update */
1440
1441 } /* inner loop ends here(4-output values in single iteration) */
1442
1443 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
1444 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
1445 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
1446
1447 } /* outer loop ends */
1448 }
1449
1450 }
1451
1452 /**
1453 *******************************************************************************
1454 *
1455 * @brief
1456 * Does chroma bi-weighted prediction on the arrays pointed by pi2_src1 and
1457 * pi2_src2 and stores it at the location pointed to by pu1_dst
1458 *
1459 * @par Description:
1460 * dst = ( (src1 + lvl_shift1)*wgt0 + (src2 + lvl_shift2)*wgt1 + (off0 +
1461 * off1 + 1) << (shift - 1) ) >> shift
1462 *
1463 * @param[in] pi2_src1
1464 * Pointer to source 1
1465 *
1466 * @param[in] pi2_src2
1467 * Pointer to source 2
1468 *
1469 * @param[out] pu1_dst
1470 * Pointer to destination
1471 *
1472 * @param[in] src_strd1
1473 * Source stride 1
1474 *
1475 * @param[in] src_strd2
1476 * Source stride 2
1477 *
1478 * @param[in] dst_strd
1479 * Destination stride
1480 *
1481 * @param[in] wgt0
1482 * weight to be multiplied to source 1
1483 *
1484 * @param[in] off0
1485 * offset 0
1486 *
1487 * @param[in] wgt1
1488 * weight to be multiplied to source 2
1489 *
1490 * @param[in] off1
1491 * offset 1
1492 *
1493 * @param[in] shift
1494 * (14 Bit depth) + log2_weight_denominator
1495 *
1496 * @param[in] lvl_shift1
1497 * added before shift and offset
1498 *
1499 * @param[in] lvl_shift2
1500 * added before shift and offset
1501 *
1502 * @param[in] ht
1503 * height of the source
1504 *
1505 * @param[in] wd
1506 * width of the source (each colour component)
1507 *
1508 * @returns
1509 *
1510 * @remarks
1511 * None
1512 *
1513 *******************************************************************************
1514 */
1515
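/*
 * A minimal scalar sketch (illustrative only, not compiled) of the chroma
 * variant.  Chroma rows hold interleaved CbCr samples, so 2 * wd samples are
 * processed per row and the Cb/Cr weights and offsets alternate per sample.
 * The function name is a placeholder.
 */
#if 0
static void weighted_pred_chroma_bi_scalar_sketch(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd1, WORD32 src_strd2,
                                                  WORD32 dst_strd,
                                                  WORD32 wgt0_cb, WORD32 wgt0_cr,
                                                  WORD32 off0_cb, WORD32 off0_cr,
                                                  WORD32 wgt1_cb, WORD32 wgt1_cr,
                                                  WORD32 off1_cb, WORD32 off1_cr,
                                                  WORD32 shift,
                                                  WORD32 lvl_shift1, WORD32 lvl_shift2,
                                                  WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < 2 * wd; col++)
        {
            /* even columns are Cb, odd columns are Cr */
            WORD32 wgt0 = (col & 1) ? wgt0_cr : wgt0_cb;
            WORD32 wgt1 = (col & 1) ? wgt1_cr : wgt1_cb;
            WORD32 off  = (col & 1) ? (off0_cr + off1_cr) : (off0_cb + off1_cb);
            WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0
                          + (pi2_src2[col] + lvl_shift2) * wgt1
                          + ((off + 1) << (shift - 1));
            i4_tmp >>= shift;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif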
1516 void ihevc_weighted_pred_chroma_bi_sse42(WORD16 *pi2_src1,
1517 WORD16 *pi2_src2,
1518 UWORD8 *pu1_dst,
1519 WORD32 src_strd1,
1520 WORD32 src_strd2,
1521 WORD32 dst_strd,
1522 WORD32 wgt0_cb,
1523 WORD32 wgt0_cr,
1524 WORD32 off0_cb,
1525 WORD32 off0_cr,
1526 WORD32 wgt1_cb,
1527 WORD32 wgt1_cr,
1528 WORD32 off1_cb,
1529 WORD32 off1_cr,
1530 WORD32 shift,
1531 WORD32 lvl_shift1,
1532 WORD32 lvl_shift2,
1533 WORD32 ht,
1534 WORD32 wd)
1535 {
1536 WORD32 row, col, temp1, temp2;
1537 WORD32 wdx2;
1538
1539 __m128i src_temp1_4x32b, src_temp2_4x32b, src_temp3_4x32b, src_temp4_4x32b;
1540 __m128i const_temp_4x32b, lvl_shift1_4x32b, lvl_shift2_4x32b, wgt0_4x32b, wgt1_4x32b;
1541
1542
1543 ASSERT(wd % 2 == 0); /* checking assumption*/
1544 ASSERT(ht % 2 == 0); /* checking assumption*/
1545
1546 temp1 = (off0_cb + off1_cb + 1) << (shift - 1);
1547 temp2 = (off0_cr + off1_cr + 1) << (shift - 1);
1548
1549 // setting values in registers
1550 const_temp_4x32b = _mm_set_epi32(temp2, temp1, temp2, temp1);
1551 lvl_shift1_4x32b = _mm_set1_epi32(lvl_shift1);
1552 lvl_shift2_4x32b = _mm_set1_epi32(lvl_shift2);
1553 wgt0_4x32b = _mm_set_epi32(wgt0_cr, wgt0_cb, wgt0_cr, wgt0_cb);
1554 wgt1_4x32b = _mm_set_epi32(wgt1_cr, wgt1_cb, wgt1_cr, wgt1_cb);
1555
1556 wdx2 = wd * 2;
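/* chroma samples are interleaved CbCr: even lanes take the Cb constants, odd lanes the Cr constants, and each row spans wdx2 entries */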
1557
1558 if(0 == (wdx2 & 7)) /* wdx2 multiple of 8 case */
1559 {
1560 __m128i src_temp5_4x32b, src_temp6_4x32b, src_temp7_4x32b, src_temp8_4x32b;
1561 /* outer for loop starts from here */
1562 for(row = 0; row < ht; row += 2)
1563 {
1564 for(col = 0; col < wdx2; col += 8)
1565 {
1566 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1567 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
1568 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
1569 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1570 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1571 /* Next 4 pixels */
1572 src_temp5_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 4)); /* row = 0 */
1573 src_temp6_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 4)); /* row = 0 */
1574 src_temp7_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1 + 4)); /* row = 1 */
1575 src_temp8_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2 + 4)); /* row = 1 */
1576
1577 /* considering pix. 3:0 by converting 16-bit into 32-bit */
1578 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
1579 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
1580 /* (pi2_src1[col] + lvl_shift1) */
1581 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
1582 /* (pi2_src2[col] + lvl_shift2) */
1583 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
1584 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
1585 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
1586 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
1587 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
1588
1589 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
1590 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
1591 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
1592 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
1593 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
1594 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
1595
1596 /* Next 4 Pixels */
1597 src_temp5_4x32b = _mm_cvtepi16_epi32(src_temp5_4x32b);
1598 src_temp6_4x32b = _mm_cvtepi16_epi32(src_temp6_4x32b);
1599 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, lvl_shift1_4x32b);
1600 src_temp6_4x32b = _mm_add_epi32(src_temp6_4x32b, lvl_shift2_4x32b);
1601 src_temp5_4x32b = _mm_mullo_epi32(src_temp5_4x32b, wgt0_4x32b);
1602 src_temp6_4x32b = _mm_mullo_epi32(src_temp6_4x32b, wgt1_4x32b);
1603 src_temp7_4x32b = _mm_cvtepi16_epi32(src_temp7_4x32b);
1604 src_temp8_4x32b = _mm_cvtepi16_epi32(src_temp8_4x32b);
1605 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, lvl_shift1_4x32b);
1606 src_temp8_4x32b = _mm_add_epi32(src_temp8_4x32b, lvl_shift2_4x32b);
1607 src_temp7_4x32b = _mm_mullo_epi32(src_temp7_4x32b, wgt0_4x32b);
1608 src_temp8_4x32b = _mm_mullo_epi32(src_temp8_4x32b, wgt1_4x32b);
1609
1610 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1611 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
1612 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
1613 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1614 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
1615 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
1616 /* (i4_tmp >> shift) */
1617 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
1618 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
1619
1620 /* Next 4 Pixels */
1621 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, src_temp6_4x32b);
1622 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, src_temp8_4x32b);
1623 src_temp5_4x32b = _mm_add_epi32(src_temp5_4x32b, const_temp_4x32b);
1624 src_temp7_4x32b = _mm_add_epi32(src_temp7_4x32b, const_temp_4x32b);
1625 src_temp5_4x32b = _mm_srai_epi32(src_temp5_4x32b, shift);
1626 src_temp7_4x32b = _mm_srai_epi32(src_temp7_4x32b, shift);
1627
1628 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp5_4x32b);
1629 src_temp3_4x32b = _mm_packs_epi32(src_temp3_4x32b, src_temp7_4x32b);
1630
1631 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1632 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
1633 src_temp3_4x32b = _mm_packus_epi16(src_temp3_4x32b, src_temp3_4x32b);
1634
1635 /* store eight 8-bit output values per row */
1636 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_4x32b); /* row = 0*/
1637 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_4x32b); /* row = 1*/
1638
1639 pi2_src1 += 8; /* Pointer update */
1640 pi2_src2 += 8; /* Pointer update */
1641 pu1_dst += 8; /* Pointer update */
1642
1643 } /* inner loop ends here(8 output values per row in a single iteration) */
1644
1645 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
1646 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
1647 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
1648
1649 } /* outer loop ends */
1650 }
1651 else /* wdx2 multiple of 4 case */
1652 {
1653 WORD32 dst0, dst1;
1654 /* outer for loop starts from here */
1655 for(row = 0; row < ht; row += 2)
1656 {
1657 for(col = 0; col < wdx2; col += 4)
1658 {
1659 /*load 8 pixel values from 7:0 pos. relative to cur. pos.*/
1660 src_temp1_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1)); /* row = 0 */
1661 src_temp2_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2)); /* row = 0 */
1662 src_temp3_4x32b = _mm_loadu_si128((__m128i *)(pi2_src1 + 1 * src_strd1)); /* row = 1 */
1663 src_temp4_4x32b = _mm_loadu_si128((__m128i *)(pi2_src2 + 1 * src_strd2)); /* row = 1 */
1664
1665 /* considering pix. 3:0 by converting 16-bit into 32-bit */
1666 src_temp1_4x32b = _mm_cvtepi16_epi32(src_temp1_4x32b);
1667 src_temp2_4x32b = _mm_cvtepi16_epi32(src_temp2_4x32b);
1668 /* (pi2_src1[col] + lvl_shift1) */
1669 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, lvl_shift1_4x32b);
1670 /* (pi2_src2[col] + lvl_shift2) */
1671 src_temp2_4x32b = _mm_add_epi32(src_temp2_4x32b, lvl_shift2_4x32b);
1672 /*i4_tmp = (pi2_src1[col] + lvl_shift1) * wgt0 */
1673 src_temp1_4x32b = _mm_mullo_epi32(src_temp1_4x32b, wgt0_4x32b);
1674 /*(pi2_src2[col] + lvl_shift2) * wgt1 */
1675 src_temp2_4x32b = _mm_mullo_epi32(src_temp2_4x32b, wgt1_4x32b);
1676
1677 src_temp3_4x32b = _mm_cvtepi16_epi32(src_temp3_4x32b);
1678 src_temp4_4x32b = _mm_cvtepi16_epi32(src_temp4_4x32b);
1679 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, lvl_shift1_4x32b);
1680 src_temp4_4x32b = _mm_add_epi32(src_temp4_4x32b, lvl_shift2_4x32b);
1681 src_temp3_4x32b = _mm_mullo_epi32(src_temp3_4x32b, wgt0_4x32b);
1682 src_temp4_4x32b = _mm_mullo_epi32(src_temp4_4x32b, wgt1_4x32b);
1683
1684 /* (pi2_src1[col] + lvl_shift1) * wgt0 + (pi2_src2[col] + lvl_shift2) * wgt1 */
1685 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, src_temp2_4x32b);
1686 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, src_temp4_4x32b);
1687
1688 /* i4_tmp += (off0 + off1 + 1) << (shift - 1); */
1689 src_temp1_4x32b = _mm_add_epi32(src_temp1_4x32b, const_temp_4x32b);
1690 src_temp3_4x32b = _mm_add_epi32(src_temp3_4x32b, const_temp_4x32b);
1691
1692 /* (i4_tmp >> shift) */
1693 src_temp1_4x32b = _mm_srai_epi32(src_temp1_4x32b, shift);
1694 src_temp3_4x32b = _mm_srai_epi32(src_temp3_4x32b, shift);
1695
1696 src_temp1_4x32b = _mm_packs_epi32(src_temp1_4x32b, src_temp3_4x32b);
1697
1698 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1699 src_temp1_4x32b = _mm_packus_epi16(src_temp1_4x32b, src_temp1_4x32b);
1700
1701 dst0 = _mm_cvtsi128_si32(src_temp1_4x32b);
1702
1703 /* dst row = 1 */
1704 src_temp2_4x32b = _mm_shuffle_epi32(src_temp1_4x32b, 1);
1705
1706 /* store four 8-bit output values */
1707 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
1708
1709 dst1 = _mm_cvtsi128_si32(src_temp2_4x32b);
1710
1711 /* row = 1 */
1712 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
1713
1714 pi2_src1 += 4; /* Pointer update */
1715 pi2_src2 += 4; /* Pointer update */
1716 pu1_dst += 4; /* Pointer update */
1717
1718 } /* inner loop ends here(4-output values in single iteration) */
1719
1720 pi2_src1 = pi2_src1 - wdx2 + 2 * src_strd1; /* Pointer update */
1721 pi2_src2 = pi2_src2 - wdx2 + 2 * src_strd2; /* Pointer update */
1722 pu1_dst = pu1_dst - wdx2 + 2 * dst_strd; /* Pointer update */
1723 }
1724 }
1725
1726 }
1727
1728 /**
1729 *******************************************************************************
1730 *
1731 * @brief
1732 * Does default bi-weighted prediction on the arrays pointed by pi2_src1 and
1733 * pi2_src2 and stores it at the location pointed to by pu1_dst
1734 *
1735 * @par Description:
1736 * dst = ( (src1 + lvl_shift1) + (src2 + lvl_shift2) + (1 << (shift - 1)) )
1737 * >> shift, where shift = 15 - BitDepth
1738 *
1739 * @param[in] pi2_src1
1740 * Pointer to source 1
1741 *
1742 * @param[in] pi2_src2
1743 * Pointer to source 2
1744 *
1745 * @param[out] pu1_dst
1746 * Pointer to destination
1747 *
1748 * @param[in] src_strd1
1749 * Source stride 1
1750 *
1751 * @param[in] src_strd2
1752 * Source stride 2
1753 *
1754 * @param[in] dst_strd
1755 * Destination stride
1756 *
1757 * @param[in] lvl_shift1
1758 * added before shift and offset
1759 *
1760 * @param[in] lvl_shift2
1761 * added before shift and offset
1762 *
1763 * @param[in] ht
1764 * height of the source
1765 *
1766 * @param[in] wd
1767 * width of the source
1768 *
1769 * @returns
1770 *
1771 * @remarks
1772 * None
1773 *
1774 * Assumption : ht%2 == 0, wd%4 == 0 (the ht%4 == 0 case takes a faster path)
1775 * shift == 7, (lvl_shift1+lvl_shift2) can take {0, 8K, 16K}. In that case,
1776 * final result will match even if intermediate precision is in 16 bit.
1777 *
1778 *******************************************************************************
1779 */
1780
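/*
 * A minimal scalar sketch (illustrative only, not compiled) of the default
 * bi-prediction below: equal weights reduce the computation to a rounded
 * average.  The function name is a placeholder; SHIFT_14_MINUS_BIT_DEPTH is
 * the same constant used by the routine below.
 */
#if 0
static void weighted_pred_bi_default_scalar_sketch(WORD16 *pi2_src1, WORD16 *pi2_src2,
                                                   UWORD8 *pu1_dst,
                                                   WORD32 src_strd1, WORD32 src_strd2,
                                                   WORD32 dst_strd,
                                                   WORD32 lvl_shift1, WORD32 lvl_shift2,
                                                   WORD32 ht, WORD32 wd)
{
    WORD32 row, col;
    WORD32 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
    for(row = 0; row < ht; row++)
    {
        for(col = 0; col < wd; col++)
        {
            /* rounded average of the two references */
            WORD32 i4_tmp = (pi2_src1[col] + lvl_shift1)
                          + (pi2_src2[col] + lvl_shift2)
                          + (1 << (shift - 1));
            i4_tmp >>= shift;
            pu1_dst[col] = (UWORD8)(i4_tmp < 0 ? 0 : (i4_tmp > 255 ? 255 : i4_tmp));
        }
        pi2_src1 += src_strd1;
        pi2_src2 += src_strd2;
        pu1_dst += dst_strd;
    }
}
#endif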
1781 void ihevc_weighted_pred_bi_default_sse42(WORD16 *pi2_src1,
1782 WORD16 *pi2_src2,
1783 UWORD8 *pu1_dst,
1784 WORD32 src_strd1,
1785 WORD32 src_strd2,
1786 WORD32 dst_strd,
1787 WORD32 lvl_shift1,
1788 WORD32 lvl_shift2,
1789 WORD32 ht,
1790 WORD32 wd)
1791 {
1792 WORD32 row, col, temp;
1793 WORD32 shift;
1794
1795 __m128i src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b, src_temp4_8x16b;
1796 __m128i const_temp_8x16b, lvl_shift1_8x16b, lvl_shift2_8x16b;
1797 __m128i src_temp5_8x16b, src_temp6_8x16b, src_temp7_8x16b, src_temp8_8x16b;
1798
1799 ASSERT(wd % 4 == 0); /* checking assumption*/
1800 ASSERT(ht % 2 == 0); /* checking assumption*/
1801
1802 shift = SHIFT_14_MINUS_BIT_DEPTH + 1;
1803 temp = 1 << (shift - 1);
1804
1805 // setting values in registers
1806 lvl_shift1_8x16b = _mm_set1_epi16(lvl_shift1);
1807 lvl_shift2_8x16b = _mm_set1_epi16(lvl_shift2);
1808 const_temp_8x16b = _mm_set1_epi16(temp);
1809
1810 lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, lvl_shift2_8x16b);
1811 lvl_shift1_8x16b = _mm_adds_epi16(lvl_shift1_8x16b, const_temp_8x16b);
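/* lvl_shift1, lvl_shift2 and the rounding term are pre-folded into a single 16-bit constant, so only one saturating add is needed per pixel */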
1812
1813 if(0 == (ht & 3)) /* ht multiple of 4*/
1814 {
1815 if(0 == (wd & 15)) /* wd multiple of 16 case */
1816 {
1817 __m128i src_temp9_8x16b, src_temp10_8x16b, src_temp11_8x16b, src_temp12_8x16b;
1818 __m128i src_temp13_8x16b, src_temp14_8x16b, src_temp15_8x16b, src_temp16_8x16b;
1819 /* outer for loop starts from here */
1820 for(row = 0; row < ht; row += 4)
1821 {
1822 for(col = 0; col < wd; col += 16)
1823 {
1824 /*load 8 pixel values */ /* First 8 Values */
1825 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1826 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1827 /* row = 1 */
1828 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1829 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1830 /* row = 2 */
1831 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1832 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1833 /* row = 3 */
1834 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1835 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1836
1837 /*load 8 pixel values */ /* Second 8 Values */
1838 src_temp9_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 8));
1839 src_temp10_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 8));
1840 /* row = 1 */
1841 src_temp11_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1 + 8));
1842 src_temp12_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2 + 8));
1843 /* row = 2 */
1844 src_temp13_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1 + 8));
1845 src_temp14_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2 + 8));
1846
1847 /* (pi2_src1[col] + pi2_src2[col]) */ /* First 8 Values */
1848 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1849 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1850 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1851 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1852
1853 /*load 8 pixel values */ /* Second 8 Values */
1854 /* row = 3 */
1855 src_temp15_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1 + 8));
1856 src_temp16_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2 + 8));
1857
1858 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* First 8 Values */
1859 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1860 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1861 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1862 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1863
1864 /* (pi2_src1[col] + pi2_src2[col]) */ /* Second 8 Values */
1865 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, src_temp10_8x16b);
1866 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, src_temp12_8x16b);
1867 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, src_temp14_8x16b);
1868 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, src_temp16_8x16b);
1869
1870 /* (i4_tmp >> shift) */ /* First 8 Values */
1871 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1872 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
1873 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1874 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
1875
1876 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */ /* Second 8 Values */
1877 src_temp9_8x16b = _mm_adds_epi16(src_temp9_8x16b, lvl_shift1_8x16b);
1878 src_temp11_8x16b = _mm_adds_epi16(src_temp11_8x16b, lvl_shift1_8x16b);
1879 src_temp13_8x16b = _mm_adds_epi16(src_temp13_8x16b, lvl_shift1_8x16b);
1880 src_temp15_8x16b = _mm_adds_epi16(src_temp15_8x16b, lvl_shift1_8x16b);
1881
1882 /* (i4_tmp >> shift) */ /* Second 8 Values */
1883 src_temp9_8x16b = _mm_srai_epi16(src_temp9_8x16b, shift);
1884 src_temp11_8x16b = _mm_srai_epi16(src_temp11_8x16b, shift);
1885 src_temp13_8x16b = _mm_srai_epi16(src_temp13_8x16b, shift);
1886 src_temp15_8x16b = _mm_srai_epi16(src_temp15_8x16b, shift);
1887
1888 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */ /* all 16 values */
1889 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp9_8x16b);
1890 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp11_8x16b);
1891 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp13_8x16b);
1892 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp15_8x16b);
1893
1894 /* store sixteen 8-bit output values per row */
1895 _mm_storeu_si128((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1896 _mm_storeu_si128((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
1897 _mm_storeu_si128((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
1898 _mm_storeu_si128((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1899
1900 /* To update pointer */
1901 pi2_src1 += 16;
1902 pi2_src2 += 16;
1903 pu1_dst += 16;
1904
1905 } /* inner loop ends here(16 output values per row in a single iteration) */
1906
1907 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
1908 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
1909 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
1910
1911 }
1912 }
1913 else if(0 == (wd & 7)) /* multiple of 8 case */
1914 {
1915 /* outer for loop starts from here */
1916 for(row = 0; row < ht; row += 4)
1917 {
1918 for(col = 0; col < wd; col += 8)
1919 {
1920 /*load 8 pixel values */
1921 src_temp1_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1));
1922 src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2));
1923 /* row = 1 */
1924 src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + src_strd1));
1925 src_temp4_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + src_strd2));
1926 /* row = 2 */
1927 src_temp5_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 2 * src_strd1));
1928 src_temp6_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 2 * src_strd2));
1929 /* row = 3 */
1930 src_temp7_8x16b = _mm_loadu_si128((__m128i *)(pi2_src1 + 3 * src_strd1));
1931 src_temp8_8x16b = _mm_loadu_si128((__m128i *)(pi2_src2 + 3 * src_strd2));
1932
1933 /* (pi2_src1[col] + pi2_src2[col]) */
1934 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
1935 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, src_temp4_8x16b);
1936 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
1937 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, src_temp8_8x16b);
1938
1939 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
1940 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
1941 src_temp3_8x16b = _mm_adds_epi16(src_temp3_8x16b, lvl_shift1_8x16b);
1942 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
1943 src_temp7_8x16b = _mm_adds_epi16(src_temp7_8x16b, lvl_shift1_8x16b);
1944
1945 /* (i4_tmp >> shift) */
1946 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
1947 src_temp3_8x16b = _mm_srai_epi16(src_temp3_8x16b, shift);
1948 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
1949 src_temp7_8x16b = _mm_srai_epi16(src_temp7_8x16b, shift);
1950
1951 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
1952 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
1953 src_temp3_8x16b = _mm_packus_epi16(src_temp3_8x16b, src_temp3_8x16b);
1954 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
1955 src_temp7_8x16b = _mm_packus_epi16(src_temp7_8x16b, src_temp7_8x16b);
1956
1957 /* store eight 8-bit output values per row */
1958 _mm_storel_epi64((__m128i *)(pu1_dst + 0 * dst_strd), src_temp1_8x16b); /* row = 0*/
1959 _mm_storel_epi64((__m128i *)(pu1_dst + 1 * dst_strd), src_temp3_8x16b); /* row = 1*/
1960 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), src_temp5_8x16b); /* row = 2*/
1961 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), src_temp7_8x16b); /* row = 3*/
1962
1963 /* To update pointer */
1964 pi2_src1 += 8;
1965 pi2_src2 += 8;
1966 pu1_dst += 8;
1967
1968 } /* inner loop ends here(8-output values in single iteration) */
1969
1970 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
1971 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
1972 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
1973
1974 }
1975 }
1976 else /* wd multiple of 4 case*/
1977 {
1978 WORD32 dst0, dst1, dst2, dst3;
1979
1980 /* outer for loop starts from here */
1981 for(row = 0; row < ht; row += 4)
1982 {
1983 for(col = 0; col < wd; col += 4)
1984 {
1985 /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
1986 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
1987 /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
1988 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
1989
1990 /* row = 1 */
1991 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
1992 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
1993 /* row = 2 */
1994 src_temp5_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 2 * src_strd1));
1995 src_temp6_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 2 * src_strd2));
1996 /* row = 3 */
1997 src_temp7_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + 3 * src_strd1));
1998 src_temp8_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + 3 * src_strd2));
1999
2000 /* Pack two rows together */
2001 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
2002 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
2003 src_temp5_8x16b = _mm_unpacklo_epi64(src_temp5_8x16b, src_temp7_8x16b);
2004 src_temp6_8x16b = _mm_unpacklo_epi64(src_temp6_8x16b, src_temp8_8x16b);
2005
2006 /* (pi2_src1[col] + pi2_src2[col]) */
2007 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
2008 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, src_temp6_8x16b);
2009
2010 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
2011 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
2012 src_temp5_8x16b = _mm_adds_epi16(src_temp5_8x16b, lvl_shift1_8x16b);
2013
2014 /* (i4_tmp >> shift) */
2015 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
2016 src_temp5_8x16b = _mm_srai_epi16(src_temp5_8x16b, shift);
2017
2018 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
2019 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2020 src_temp5_8x16b = _mm_packus_epi16(src_temp5_8x16b, src_temp5_8x16b);
2021
2022 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
2023 /* extract dst rows 1 and 3 */
2024 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
2025 src_temp4_8x16b = _mm_shuffle_epi32(src_temp5_8x16b, 1);
2026
2027 /* store four 8-bit output values */
2028 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
2029
2030 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
2031 dst2 = _mm_cvtsi128_si32(src_temp5_8x16b);
2032 dst3 = _mm_cvtsi128_si32(src_temp4_8x16b);
2033
2034 /* row = 1 to row = 3 */
2035 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
2036 *(WORD32 *)(&pu1_dst[2 * dst_strd]) = dst2;
2037 *(WORD32 *)(&pu1_dst[3 * dst_strd]) = dst3;
2038
2039 /* To update pointer */
2040 pi2_src1 += 4;
2041 pi2_src2 += 4;
2042 pu1_dst += 4;
2043
2044 } /* inner loop ends here(4-output values in single iteration) */
2045
2046 pi2_src1 = pi2_src1 - wd + 4 * src_strd1; /* Pointer update */
2047 pi2_src2 = pi2_src2 - wd + 4 * src_strd2; /* Pointer update */
2048 pu1_dst = pu1_dst - wd + 4 * dst_strd; /* Pointer update */
2049
2050 }
2051 }
2052 }
2053 else /* ht multiple of 2 case and wd multiple of 4 case*/
2054 {
2055
2056 WORD32 dst0, dst1;
2057
2058 /* outer for loop starts from here */
2059 for(row = 0; row < ht; row += 2)
2060 {
2061 for(col = 0; col < wd; col += 4)
2062 {
2063 /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
2064 src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1));
2065 /*load 4 pixel values from 3:0 pos. relative to cur. pos.*/
2066 src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2));
2067
2068 /* row = 1 */
2069 src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src1 + src_strd1));
2070 src_temp4_8x16b = _mm_loadl_epi64((__m128i *)(pi2_src2 + src_strd2));
2071
2072 /* Pack two rows together */
2073 src_temp1_8x16b = _mm_unpacklo_epi64(src_temp1_8x16b, src_temp3_8x16b);
2074 src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp4_8x16b);
2075
2076 /* (pi2_src1[col] + pi2_src2[col]) */
2077 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, src_temp2_8x16b);
2078
2079 /* i4_tmp = (pi2_src1[col] + pi2_src2[col] + lvl_shift1 + lvl_shift2 + shift_value) */
2080 src_temp1_8x16b = _mm_adds_epi16(src_temp1_8x16b, lvl_shift1_8x16b);
2081
2082 /* (i4_tmp >> shift) */
2083 src_temp1_8x16b = _mm_srai_epi16(src_temp1_8x16b, shift);
2084
2085 /* pu1_dst[col] = CLIP_U8(i4_tmp >> shift); */
2086 src_temp1_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp1_8x16b);
2087
2088 dst0 = _mm_cvtsi128_si32(src_temp1_8x16b);
2089 /* dst row = 1 */
2090 src_temp2_8x16b = _mm_shuffle_epi32(src_temp1_8x16b, 1);
2091
2092 /* store four 8-bit output values */
2093 *(WORD32 *)(&pu1_dst[0 * dst_strd]) = dst0;
2094
2095 dst1 = _mm_cvtsi128_si32(src_temp2_8x16b);
2096
2097 /* row = 1 */
2098 *(WORD32 *)(&pu1_dst[1 * dst_strd]) = dst1;
2099
2100 /* To update pointer */
2101 pi2_src1 += 4;
2102 pi2_src2 += 4;
2103 pu1_dst += 4;
2104
2105 } /* inner loop ends here(4-output values in single iteration) */
2106
2107 pi2_src1 = pi2_src1 - wd + 2 * src_strd1; /* Pointer update */
2108 pi2_src2 = pi2_src2 - wd + 2 * src_strd2; /* Pointer update */
2109 pu1_dst = pu1_dst - wd + 2 * dst_strd; /* Pointer update */
2110
2111 }
2112
2113 }
2114
2115 }
2116