1 /******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ih264e_half_pel_ssse3.c
24 *
25 * @brief
26 * Contains the x86 SSSE3 intrinsic definitions of the 6-tap horizontal filter
27 * and the cascaded 2D filter used in motion estimation in the H264 encoder.
28 *
29 * @author
30 * Ittiam
31 *
32 * @par List of Functions:
33 * ih264e_sixtapfilter_horz_ssse3
34 * ih264e_sixtap_filter_2dvh_vert_ssse3
35 *
36 * @remarks
37 * None
38 *
39 *******************************************************************************
40 */
41
42 /*****************************************************************************/
43 /* File Includes */
44 /*****************************************************************************/
45
46 /* System include files */
47 #include <stdio.h>
48 #include <assert.h>
49 #include <limits.h>
50
51 /* User include files */
52 #include "ih264_typedefs.h"
53 #include "ithread.h"
54 #include "ih264_platform_macros.h"
55 #include "ih264_defs.h"
56 #include "ih264e_half_pel.h"
57 #include "ih264_macros.h"
58 #include "ih264e_debug.h"
59 #include "ih264_inter_pred_filters.h"
60 #include "ih264_mem_fns.h"
61 #include "ih264_padding.h"
62 #include "ih264_intra_pred_filters.h"
63 #include "ih264_deblk_edge_filters.h"
64
65
66 /*****************************************************************************/
67 /* Function Definitions */
68 /*****************************************************************************/
69 /*
70 *******************************************************************************
71 *
72 * @brief
73 * Interprediction luma filter for horizontal input(Filter run for width = 17
74 * and height =16)
75 *
76 * @par Description:
77 * Applies a 6 tap horizontal filter .The output is clipped to 8 bits sec.
78 * 8.4.2.2.1 titled "Luma sample interpolation process"
79 *
80 * @param[in] pu1_src
81 * UWORD8 pointer to the source
82 *
83 * @param[out] pu1_dst
84 * UWORD8 pointer to the destination
85 *
86 * @param[in] src_strd
87 * integer source stride
88 *
89 * @param[in] dst_strd
90 * integer destination stride
91 *
92 * @returns
93 * None
94 *
95 * @remarks
96 * None
97 *
98 *******************************************************************************
99 */
ih264e_sixtapfilter_horz_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd)100 void ih264e_sixtapfilter_horz_ssse3(UWORD8 *pu1_src,
101 UWORD8 *pu1_dst,
102 WORD32 src_strd,
103 WORD32 dst_strd)
104 {
105 WORD32 ht;
106 WORD32 tmp;
107
108 __m128i src_r0_16x8b, src_r1_16x8b, src_r0_sht_16x8b, src_r1_sht_16x8b;
109 __m128i src_r0_t1_16x8b, src_r1_t1_16x8b;
110
111 __m128i res_r0_t1_8x16b, res_r0_t2_8x16b, res_r0_t3_8x16b;
112 __m128i res_r1_t1_8x16b, res_r1_t2_8x16b, res_r1_t3_8x16b;
113
114 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
115 __m128i const_val16_8x16b;
116
117 ht = 16;
118 pu1_src -= 2; // the filter input starts from x[-2] (till x[3])
119
120 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
121 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
122 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
123 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
124 const_val16_8x16b = _mm_set1_epi16(16);
125
126 //Row0 : a0 a1 a2 a3 a4 a5 a6 a7 a8 a9.....
127 //Row0 : b0 b1 b2 b3 b4 b5 b6 b7 b8 b9.....
128 //b0 is same a8. Similarly other bn pixels are same as a(n+8) pixels.
129
130 do
131 {
132 src_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src); //a0 a1 a2 a3 a4 a5 a6 a7 a8 a9....a15
133 src_r1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + 8)); //b0 b1 b2 b3 b4 b5 b6 b7 b8 b9....b15
134
135 src_r0_sht_16x8b = _mm_srli_si128(src_r0_16x8b, 1); //a1 a2 a3 a4 a5 a6 a7 a8 a9....a15 0
136 src_r1_sht_16x8b = _mm_srli_si128(src_r1_16x8b, 1); //b1 b2 b3 b4 b5 b6 b7 b8 b9....b15 0
137
138 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a0 a1 a1 a2 a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8
139 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b0 b1 b1 b2 b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8
140
141 res_r0_t1_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff0_1_16x8b); //a0*c0+a1*c1 a1*c0+a2*c1 a2*c0+a3*c1 a3*c0+a4*c1
142 //a4*c0+a5*c1 a5*c0+a6*c1 a6*c0+a7*c1 a7*c0+a8*c1
143 res_r1_t1_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff0_1_16x8b); //b0*c0+b1*c1 b1*c0+b2*c1 b2*c0+b3*c1 b3*c0+b4*c1
144 //b4*c0+b5*c1 b5*c0+b6*c1 b6*c0+b7*c1 b7*c0+b8*c1
145
146 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a2 a3 a4 a5 a6 a7 a8 a9....a15 0 0
147 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b2 b3 b4 b5 b6 b7 b8 b9....b15 0 0
148
149 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a3 a4 a5 a6 a7 a8 a9....a15 0 0 0
150 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b3 b4 b5 b6 b7 b8 b9....b15 0 0 0
151
152 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a2 a3 a3 a4 a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10
153 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b2 b3 b3 b4 b4 b5 b5 b6 b6 b7 b7 b8 a8 a9 a9 a10
154
155 res_r0_t2_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff2_3_16x8b); //a2*c2+a3*c3 a3*c2+a4*c3 a4*c2+a5*c3 a5*c2+a6*c3
156 //a6*c2+a7*c3 a7*c2+a8*c3 a8*c2+a9*c3 a9*c2+a10*c3
157 res_r1_t2_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff2_3_16x8b); //b2*c2+b3*c3 b3*c2+b4*c3 b2*c4+b5*c3 b5*c2+b6*c3
158 //b6*c2+b7*c3 b7*c2+b8*c3 b8*c2+b9*c3 b9*c2+b10*c3
159
160 src_r0_16x8b = _mm_srli_si128(src_r0_16x8b, 2); //a4 a5 a6 a7 a8 a9....a15 0 0 0 0
161 src_r1_16x8b = _mm_srli_si128(src_r1_16x8b, 2); //b4 b5 b6 b7 b8 b9....b15 0 0 0 0
162
163 src_r0_sht_16x8b = _mm_srli_si128(src_r0_sht_16x8b, 2); //a5 a6 a7 a8 a9....a15 0 0 0 0 0
164 src_r1_sht_16x8b = _mm_srli_si128(src_r1_sht_16x8b, 2); //b5 b6 b7 b8 b9....b15 0 0 0 0 0
165
166 src_r0_t1_16x8b = _mm_unpacklo_epi8(src_r0_16x8b, src_r0_sht_16x8b); //a4 a5 a5 a6 a6 a7 a7 a8 a8 a9 a9 a10 a10 a11 a11 a12
167 src_r1_t1_16x8b = _mm_unpacklo_epi8(src_r1_16x8b, src_r1_sht_16x8b); //b4 b5 b5 b6 b6 b7 b7 b8 b8 b9 b9 b10 b10 b11 b11 b12
168
169 res_r0_t3_8x16b = _mm_maddubs_epi16(src_r0_t1_16x8b, coeff4_5_16x8b); //a4*c4+a5*c5 a5*c4+a6*c5 a6*c4+a7*c5 a7*c4+a8*c5
170 //a8*c4+a9*c5 a9*c4+a10*c5 a10*c4+a11*c5 a11*c4+a12*c5
171 res_r1_t3_8x16b = _mm_maddubs_epi16(src_r1_t1_16x8b, coeff4_5_16x8b); //b4*c4+b5*c5 b5*c4+b6*c5 b6*c4+b7*c5 b7*c4+b8*c5
172 //b8*c4+b9*c5 b9*c4+b10*c5 b10*c4+b11*c5 b11*c4+b12*c5
173 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t2_8x16b);
174 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t2_8x16b);
175 res_r0_t3_8x16b = _mm_add_epi16(res_r0_t3_8x16b, const_val16_8x16b);
176 res_r1_t3_8x16b = _mm_add_epi16(res_r1_t3_8x16b, const_val16_8x16b);
177 res_r0_t1_8x16b = _mm_add_epi16(res_r0_t1_8x16b, res_r0_t3_8x16b);
178 res_r1_t1_8x16b = _mm_add_epi16(res_r1_t1_8x16b, res_r1_t3_8x16b);
179
180 tmp = ((pu1_src[18] + pu1_src[19]) << 2) - pu1_src[17] - pu1_src[20];
181 tmp = pu1_src[16] + pu1_src[21] + (tmp << 2) + tmp;
182
183 res_r0_t1_8x16b = _mm_srai_epi16(res_r0_t1_8x16b, 5); //shifting right by 5 bits.
184 res_r1_t1_8x16b = _mm_srai_epi16(res_r1_t1_8x16b, 5);
185 tmp = (tmp + 16) >> 5;
186
187 src_r0_16x8b = _mm_packus_epi16(res_r0_t1_8x16b, res_r1_t1_8x16b);
188 pu1_dst[16] = CLIP_U8(tmp);
189
190 _mm_storeu_si128((__m128i *)pu1_dst, src_r0_16x8b);
191
192 ht--;
193 pu1_src += src_strd;
194 pu1_dst += dst_strd;
195 }
196 while(ht > 0);
197 }
198
199 /*
200 *******************************************************************************
201 *
202 * @brief
203 * This function implements a two stage cascaded six tap filter. It
204 * applies the six tap filter in the vertical direction on the
205 * predictor values, followed by applying the same filter in the
206 * horizontal direction on the output of the first stage. The six tap
207 * filtering operation is described in sec 8.4.2.2.1 titled "Luma sample
208 * interpolation process" (Filter run for width = 17 and height =17)
209 *
210 * @par Description:
211 * The function interpolates the predictors first in the vertical direction
212 * and then in the horizontal direction to output the (1/2,1/2). The output
213 * of the first stage of the filter is stored in the buffer pointed to by
214 * pi16_pred1(only in C) in 16 bit precision.
215 *
216 * @param[in] pu1_src
217 * UWORD8 pointer to the source
218 *
219 * @param[out] pu1_dst1
220 * UWORD8 pointer to the destination(Vertical filtered output)
221 *
222 * @param[out] pu1_dst2
223 * UWORD8 pointer to the destination(out put after applying horizontal filter
224 * to the intermediate vertical output)
225 *
226 * @param[in] src_strd
227 * integer source stride
228
229 * @param[in] dst_strd
230 * integer destination stride of pu1_dst
231 *
232 * @param[in]pi16_pred1
233 * Pointer to 16bit intermediate buffer(used only in c)
234 *
235 * @param[in] pi16_pred1_strd
236 * integer destination stride of pi16_pred1
237 *
238 * @returns
239 * None
240 *
241 * @remarks
242 * None
243 *
244 *******************************************************************************
245 */
ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_dst1,UWORD8 * pu1_dst2,WORD32 src_strd,WORD32 dst_strd,WORD32 * pi4_pred1,WORD32 pred1_strd)246 void ih264e_sixtap_filter_2dvh_vert_ssse3(UWORD8 *pu1_src,
247 UWORD8 *pu1_dst1,
248 UWORD8 *pu1_dst2,
249 WORD32 src_strd,
250 WORD32 dst_strd,
251 WORD32 *pi4_pred1,
252 WORD32 pred1_strd)
253 {
254 WORD32 ht;
255 WORD16 *pi2_pred1;
256
257 ht = 17;
258 pi2_pred1 = (WORD16 *)pi4_pred1;
259 pred1_strd = pred1_strd << 1;
260
261 // Vertical 6-tap filter
262 {
263 __m128i src1_r0_16x8b, src1_r1_16x8b, src1_r2_16x8b;
264 __m128i src1_r3_16x8b, src1_r4_16x8b, src1_r5_16x8b;
265 __m128i src2_r0_16x8b, src2_r1_16x8b, src2_r2_16x8b;
266 __m128i src2_r3_16x8b, src2_r4_16x8b, src2_r5_16x8b;
267
268 __m128i src_r0r1_16x8b, src_r2r3_16x8b, src_r4r5_16x8b;
269
270 __m128i res_t1_8x16b, res_t2_8x16b, res_t3_8x16b;
271 __m128i coeff0_1_16x8b, coeff2_3_16x8b, coeff4_5_16x8b;
272
273 coeff0_1_16x8b = _mm_set1_epi32(0xFB01FB01); //c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1 c0 c1
274 coeff2_3_16x8b = _mm_set1_epi32(0x14141414); //c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3 c2 c3
275 coeff4_5_16x8b = _mm_set1_epi32(0x01FB01FB); //c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5 c4 c5
276 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
277
278 pu1_src -= 2;
279 pu1_src -= src_strd << 1; // the filter input starts from x[-2] (till x[3])
280
281 // Loading first five rows to start first row processing.
282 // 22 values loaded in each row.
283 src1_r0_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
284 src2_r0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
285 pu1_src += src_strd;
286
287 src1_r1_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
288 src2_r1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
289 pu1_src += src_strd;
290
291 src1_r2_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
292 src2_r2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
293 pu1_src += src_strd;
294
295 src1_r3_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
296 src2_r3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
297 pu1_src += src_strd;
298
299 src1_r4_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
300 src2_r4_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
301 pu1_src += src_strd;
302
303 do
304 {
305 src1_r5_16x8b = _mm_loadu_si128((__m128i *)pu1_src);
306 src2_r5_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 14));
307
308 src_r0r1_16x8b = _mm_unpacklo_epi8(src1_r0_16x8b, src1_r1_16x8b);
309 src_r2r3_16x8b = _mm_unpacklo_epi8(src1_r2_16x8b, src1_r3_16x8b);
310 src_r4r5_16x8b = _mm_unpacklo_epi8(src1_r4_16x8b, src1_r5_16x8b);
311
312 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
313 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
314 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
315
316 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
317 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
318
319 _mm_storeu_si128((__m128i *)pi2_pred1, res_t1_8x16b);
320
321 src_r0r1_16x8b = _mm_unpackhi_epi8(src1_r0_16x8b, src1_r1_16x8b);
322 src_r2r3_16x8b = _mm_unpackhi_epi8(src1_r2_16x8b, src1_r3_16x8b);
323 src_r4r5_16x8b = _mm_unpackhi_epi8(src1_r4_16x8b, src1_r5_16x8b);
324
325 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
326 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
327 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
328
329 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
330 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
331
332 _mm_storeu_si128((__m128i *)(pi2_pred1 + 8), res_t1_8x16b);
333
334 src_r0r1_16x8b = _mm_unpacklo_epi8(src2_r0_16x8b, src2_r1_16x8b);
335 src_r2r3_16x8b = _mm_unpacklo_epi8(src2_r2_16x8b, src2_r3_16x8b);
336 src_r4r5_16x8b = _mm_unpacklo_epi8(src2_r4_16x8b, src2_r5_16x8b);
337
338 res_t1_8x16b = _mm_maddubs_epi16(src_r0r1_16x8b, coeff0_1_16x8b);
339 res_t2_8x16b = _mm_maddubs_epi16(src_r2r3_16x8b, coeff2_3_16x8b);
340 res_t3_8x16b = _mm_maddubs_epi16(src_r4r5_16x8b, coeff4_5_16x8b);
341
342 res_t1_8x16b = _mm_add_epi16(res_t1_8x16b, res_t2_8x16b);
343 res_t1_8x16b = _mm_add_epi16(res_t3_8x16b, res_t1_8x16b);
344
345 _mm_storeu_si128((__m128i *)(pi2_pred1 + 14), res_t1_8x16b);
346
347 src1_r0_16x8b = src1_r1_16x8b;
348 src1_r1_16x8b = src1_r2_16x8b;
349 src1_r2_16x8b = src1_r3_16x8b;
350 src1_r3_16x8b = src1_r4_16x8b;
351 src1_r4_16x8b = src1_r5_16x8b;
352
353 src2_r0_16x8b = src2_r1_16x8b;
354 src2_r1_16x8b = src2_r2_16x8b;
355 src2_r2_16x8b = src2_r3_16x8b;
356 src2_r3_16x8b = src2_r4_16x8b;
357 src2_r4_16x8b = src2_r5_16x8b;
358
359 ht--;
360 pu1_src += src_strd;
361 pi2_pred1 += pred1_strd;
362 }
363 while(ht > 0);
364 }
365
366 ht = 17;
367 pi2_pred1 = (WORD16 *)pi4_pred1;
368
369 // Horizontal 6-tap filter
370 {
371 WORD32 temp;
372
373 __m128i src_r0_8x16b, src_r1_8x16b, src_r2_8x16b, src_r3_8x16b;
374 __m128i src_r4_8x16b, src_r5_8x16b;
375 __m128i src_r0r1_8x16b, src_r2r3_8x16b, src_r4r5_8x16b;
376 __m128i res_vert1_8x16b, res_vert2_8x16b, res_16x8b;
377
378 __m128i res_t0_4x32b, res_t1_4x32b, res_t2_4x32b, res_t3_4x32b;
379 __m128i res_c0_8x16b, res_c1_8x16b;
380
381 __m128i coeff0_1_8x16b, coeff2_3_8x16b, coeff4_5_8x16b;
382 __m128i const_val512_4x32b, const_val16_8x16b;
383
384 coeff0_1_8x16b = _mm_set1_epi32(0xFFFB0001); //c0 c1 c0 c1 c0 c1 c0 c1
385 coeff2_3_8x16b = _mm_set1_epi32(0x00140014); //c2 c3 c2 c3 c2 c3 c2 c3
386 coeff4_5_8x16b = _mm_set1_epi32(0x0001FFFB); //c4 c5 c4 c5 c4 c5 c4 c5
387 //c0 = c5 = 1, c1 = c4 = -5, c2 = c3 = 20
388 const_val512_4x32b = _mm_set1_epi32(512);
389 const_val16_8x16b = _mm_set1_epi16(16);
390
391 do
392 {
393 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1));
394 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 1));
395 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 2));
396 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 3));
397 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 4));
398 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 5));
399
400 res_vert1_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
401 res_vert1_8x16b = _mm_srai_epi16(res_vert1_8x16b, 5); //shifting right by 5 bits.
402
403 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
404 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
405 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
406
407 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
408 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
409 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
410
411 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
412 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
413 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
414 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
415
416 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
417 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
418 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
419
420 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
421 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
422 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
423
424 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
425 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
426 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
427 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
428
429 res_c0_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
430
431 src_r0_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8));
432 src_r1_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 1));
433 src_r2_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 2));
434 src_r3_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 3));
435 src_r4_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 4));
436 src_r5_8x16b = _mm_loadu_si128((__m128i *)(pi2_pred1 + 8 + 5));
437
438 res_vert2_8x16b = _mm_add_epi16(src_r2_8x16b, const_val16_8x16b);
439 res_vert2_8x16b = _mm_srai_epi16(res_vert2_8x16b, 5); //shifting right by 5 bits.
440
441 src_r0r1_8x16b = _mm_unpacklo_epi16(src_r0_8x16b, src_r1_8x16b);
442 src_r2r3_8x16b = _mm_unpacklo_epi16(src_r2_8x16b, src_r3_8x16b);
443 src_r4r5_8x16b = _mm_unpacklo_epi16(src_r4_8x16b, src_r5_8x16b);
444
445 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
446 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
447 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
448
449 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
450 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
451 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
452 res_t0_4x32b = _mm_srai_epi32(res_t1_4x32b ,10);
453
454 src_r0r1_8x16b = _mm_unpackhi_epi16(src_r0_8x16b, src_r1_8x16b);
455 src_r2r3_8x16b = _mm_unpackhi_epi16(src_r2_8x16b, src_r3_8x16b);
456 src_r4r5_8x16b = _mm_unpackhi_epi16(src_r4_8x16b, src_r5_8x16b);
457
458 res_t1_4x32b = _mm_madd_epi16(src_r0r1_8x16b, coeff0_1_8x16b);
459 res_t2_4x32b = _mm_madd_epi16(src_r2r3_8x16b, coeff2_3_8x16b);
460 res_t3_4x32b = _mm_madd_epi16(src_r4r5_8x16b, coeff4_5_8x16b);
461
462 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t2_4x32b);
463 res_t3_4x32b = _mm_add_epi32(res_t3_4x32b, const_val512_4x32b);
464 res_t1_4x32b = _mm_add_epi32(res_t1_4x32b, res_t3_4x32b);
465 res_t1_4x32b = _mm_srai_epi32(res_t1_4x32b, 10);
466
467 res_c1_8x16b = _mm_packs_epi32(res_t0_4x32b, res_t1_4x32b);
468
469 res_16x8b = _mm_packus_epi16(res_vert1_8x16b, res_vert2_8x16b);
470 _mm_storeu_si128((__m128i *)pu1_dst1, res_16x8b);
471 pu1_dst1[16] = CLIP_U8((pi2_pred1[18] + 16) >> 5);
472
473 res_16x8b = _mm_packus_epi16(res_c0_8x16b, res_c1_8x16b);
474 _mm_storeu_si128((__m128i *)pu1_dst2, res_16x8b);
475 temp = ((pi2_pred1[18] + pi2_pred1[19]) << 2) - pi2_pred1[17] - pi2_pred1[20];
476 temp = pi2_pred1[16] + pi2_pred1[21] + (temp << 2) + temp;
477 pu1_dst2[16] = CLIP_U8((temp + 512) >> 10);
478
479 ht--;
480 pi2_pred1 += pred1_strd;
481 pu1_dst1 += dst_strd;
482 pu1_dst2 += dst_strd;
483 }
484 while(ht > 0);
485 }
486 }
487