1 /******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /*****************************************************************************/
21 /* */
22 /* File Name : ih264_deblk_luma_ssse3.c */
23 /* */
24 /* Description : Contains function definitions for deblocking */
25 /* */
26 /* List of Functions : ih264_deblk_luma_vert_bs4_ssse3() */
27 /* ih264_deblk_luma_horz_bs4_ssse3() */
28 /* ih264_deblk_luma_vert_bslt4_ssse3() */
29 /* ih264_deblk_luma_horz_bslt4_ssse3() */
30 /* ih264_deblk_luma_vert_bs4_mbaff_ssse3() */
31 /* ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */
32 /* */
33 /* Issues / Problems : None */
34 /* */
35 /* Revision History : */
36 /* */
37 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
38 /* 12 02 2015 Naveen Kumar P Added luma deblocking ssse3 */
39 /* intrinsics */
40 /* */
41 /*****************************************************************************/
42
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46
/* System include files */
#include <stdio.h>
#include <immintrin.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"
55
56 /*****************************************************************************/
57 /* Function Definitions */
58 /*****************************************************************************/
59
60 /*****************************************************************************/
61 /* */
62 /* Function Name : ih264_deblk_luma_vert_bs4_ssse3() */
63 /* */
64 /* Description : This function performs filtering of a luma block */
65 /* vertical edge when the boundary strength is set to 4. */
66 /* */
67 /* Inputs : pu1_src - pointer to the src sample q0 */
68 /* src_strd - source stride */
69 /* alpha - alpha value for the boundary */
70 /* beta - beta value for the boundary */
71 /* */
72 /* Globals : None */
73 /* */
74 /* Processing : This operation is described in Sec. 8.7.2.4 under the */
75 /* title "Filtering process for edges for bS equal to 4" in */
76 /* ITU T Rec H.264. */
77 /* */
78 /* Outputs : None */
79 /* */
80 /* Returns : None */
81 /* */
82 /* Issues : None */
83 /* */
84 /* Revision History: */
85 /* */
86 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
87 /* 12 02 2015 Naveen Kumar P Initial version */
88 /* */
89 /*****************************************************************************/
void ih264_deblk_luma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    __m128i zero = _mm_setzero_si128();
    /* One edge-parallel pixel column per register: 16 packed unsigned
       8-bit lanes, one lane per row of the 16-row edge. */
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    /* Same pixels widened to 8 packed 16-bit lanes so the filter sums
       and signed compares cannot overflow. */
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    /* "_1": normal (3-tap) filter outputs; "_2": strong filter outputs */
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    /* Per-pixel byte masks (0xFF = condition true) that steer the final
       blend between filtered and original pixels. */
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load 8 pixels (p3..p0 q0..q3) from each of rows 0..7 around the
       vertical edge, then transpose via unpack stages so each register
       holds one column for those 8 rows. */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    /* Column pairs for rows 0..7 (parked in the *_8x16 registers until
       the rows 8..15 halves are ready to be joined below). */
    p1_8x16 = _mm_unpacklo_epi32(line1, line3);
    p0_8x16 = _mm_unpackhi_epi32(line1, line3);
    q0_8x16 = _mm_unpacklo_epi32(line2, line4);
    q1_8x16 = _mm_unpackhi_epi32(line2, line4);

    /* Same load + transpose for rows 8..15. */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd));

    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    temp1 = _mm_unpacklo_epi32(line1, line3);
    temp2 = _mm_unpackhi_epi32(line1, line3);
    temp3 = _mm_unpacklo_epi32(line2, line4);
    temp4 = _mm_unpackhi_epi32(line2, line4);

    /* Join rows 0..7 (low 64 bits) with rows 8..15 (high 64 bits):
       each *_16x8 register now holds one full 16-pixel column. */
    p3_16x8 = _mm_unpacklo_epi64(p1_8x16, temp1);
    p2_16x8 = _mm_unpackhi_epi64(p1_8x16, temp1);
    q2_16x8 = _mm_unpacklo_epi64(q1_8x16, temp4);
    q3_16x8 = _mm_unpackhi_epi64(q1_8x16, temp4);
    p1_16x8 = _mm_unpacklo_epi64(p0_8x16, temp2);
    p0_16x8 = _mm_unpackhi_epi64(p0_8x16, temp2);
    q0_16x8 = _mm_unpacklo_epi64(q0_8x16, temp3);
    q1_16x8 = _mm_unpackhi_epi64(q0_8x16, temp3);

    /* Cond1 (ABS(p0 - q0) < alpha).
       |a - b| in u8 is computed as subs_epu8(a,b) + subs_epu8(b,a):
       one of the saturating differences is 0, the other is |a - b|.
       The bytes are then widened to 16 bit so the signed cmpgt against
       alpha is exact, and the two compare halves are re-packed into a
       16-lane byte mask. */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    /* Cond2 (ABS(q1 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Cond3 (ABS(p1 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    /* flag1 = Cond1 && Cond2 && Cond3: master "filter this pixel" mask */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Strong-filter gate: (ABS(p0 - q0) < ((alpha >> 2) + 2)) */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* flag3 = flag2 && (ABS(p2 - p0) < beta): strong filter on p side */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    /* flag4 = flag2 && (ABS(q2 - q0) < beta): strong filter on q side */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    /* First 8 pixels: widen to 16 bit and evaluate both filter variants */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    /* Normal filter:
       p0_1 = (2*p1 + p0 + q1 + 2) >> 2
       q0_1 = (2*q1 + q0 + p1 + 2) >> 2
       temp5/temp6 keep the partial sums (p0+q1+2)/(p1+q0+2) alive for
       reuse by the strong-filter computations below. */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    /* Strong filter:
       p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2
       q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2 */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    /* p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
       q0_2 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3 */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
       q2_2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3 */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* Second 8 pixels: identical math on the high halves, with each
       result packed (unsigned-saturated back to 8 bit) behind the
       corresponding first-half result. */
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    /* p0_1 and q0_1 */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    /* p1_2 and q1_2 */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    /* p0_2 and q0_2 */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    /* p2_2 and q2_2 */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    /* Masked selects, dst = (dst & ~flag) + (new & flag); the add acts
       as OR because the two masked operands occupy disjoint bytes.
       Normal-filter p0/q0 results are applied under flag1 first... */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    /* ...then overwritten by the strong-filter results wherever
       flag3 (p side) / flag4 (q side) also hold. */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    /* p1 and q1 (only modified by the strong filter) */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    /* p2 and q2 (only modified by the strong filter) */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Transpose the low halves (rows 0..7) back to row order and store
       8 bytes per row. */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);

    /* Transpose and store rows 8..15. */
    temp1 = _mm_unpackhi_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpackhi_epi8(p1_16x8, p0_16x8);
    temp3 = _mm_unpackhi_epi8(q0_16x8, q1_16x8);
    temp4 = _mm_unpackhi_epi8(q2_16x8, q3_16x8);

    p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
    p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
    q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
    q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
    line8 = _mm_srli_si128(line7, 8);

    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 8 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 9 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 10 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 11 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 12 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 13 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 14 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 15 * src_strd), line8);

}
480
481 /*****************************************************************************/
482 /* */
483 /* Function Name : ih264_deblk_luma_horz_bs4_ssse3() */
484 /* */
485 /* Description : This function performs filtering of a luma block */
486 /* horizontal edge when the boundary strength is set to 4. */
487 /* */
488 /* Inputs : pu1_src - pointer to the src sample q0 */
489 /* src_strd - source stride */
490 /* alpha - alpha value for the boundary */
491 /* beta - beta value for the boundary */
492 /* */
493 /* Globals : None */
494 /* */
495 /* Processing : This operation is described in Sec. 8.7.2.4 under the */
496 /* title "Filtering process for edges for bS equal to 4" in */
497 /* ITU T Rec H.264. */
498 /* */
499 /* Outputs : None */
500 /* */
501 /* Returns : None */
502 /* */
503 /* Issues : None */
504 /* */
505 /* Revision History: */
506 /* */
507 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
508 /* 12 02 2015 Naveen Kumar P Initial version */
509 /* */
510 /*****************************************************************************/
void ih264_deblk_luma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                     WORD32 src_strd,
                                     WORD32 alpha,
                                     WORD32 beta)
{
    /* Row offsets (in bytes) of p3..p0 relative to pu1_HorzPixel and of
       q1..q3 relative to pu1_src. */
    WORD16 i16_posP3, i16_posP2, i16_posP1, i16_posP0;
    WORD16 i16_posQ1, i16_posQ2, i16_posQ3;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    /* One pixel row per register: 16 packed unsigned 8-bit lanes, one
       lane per column across the horizontal edge. */
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    /* Same pixels widened to 8 packed 16-bit lanes for exact arithmetic */
    __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
    __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
    /* "_1": normal (3-tap) filter outputs; "_2": strong filter outputs */
    __m128i q0_16x8_1;
    __m128i p0_16x8_1;
    __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
    __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
    __m128i temp1, temp2, temp3, temp4, temp5, temp6;
    __m128i Alpha_8x16, Beta_8x16;
    /* per-pixel byte masks (0xFF = condition true) steering the blends */
    __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
    __m128i const_val2_16x8 = _mm_set1_epi16(2);

    /* pu1_HorzPixel points at the p3 row, 4 rows above the edge. */
    pu1_HorzPixel = pu1_src - (src_strd << 2);

    i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posQ3 = X3(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;
    i16_posP3 = 0;

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Load the 8 rows straddling the edge; no transpose is needed for a
       horizontal edge. p3/q3 are filter taps only and are never stored. */
    p3_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP3));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));
    q3_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ3));

    /* Cond1 (ABS(p0 - q0) < alpha).
       |a - b| in u8 is subs_epu8(a,b) + subs_epu8(b,a): one saturating
       difference is 0, the other is |a - b|. Bytes are widened to 16 bit
       so the signed compare is exact, then re-packed into a byte mask. */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);

    /* Cond2 (ABS(q1 - q0) < beta) */
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Cond3 (ABS(p1 - p0) < beta) */
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    /* flag1 = Cond1 && Cond2 && Cond3: master "filter this pixel" mask */
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* Strong-filter gate: (ABS(p0 - q0) < ((alpha >> 2) + 2)) */
    temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);
    Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
    Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* flag3 = flag2 && (ABS(p2 - p0) < beta): strong filter on p side */
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);

    /* flag4 = flag2 && (ABS(q2 - q0) < beta): strong filter on q side */
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag4_16x8 = _mm_packs_epi16(temp2, temp1);
    flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);

    /* First 8 pixels: widen to 16 bit and evaluate both filter variants */
    p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);

    /* Normal filter:
       p0_1 = (2*p1 + p0 + q1 + 2) >> 2
       q0_1 = (2*q1 + q0 + p1 + 2) >> 2
       temp5/temp6 keep the partial sums (p0+q1+2)/(p1+q0+2) alive for
       reuse by the strong-filter computations below. */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    p0_16x8_1 = _mm_srai_epi16(temp1, 2);
    q0_16x8_1 = _mm_srai_epi16(temp2, 2);

    /* Strong filter:
       p1_2 = (p2 + p1 + p0 + q0 + 2) >> 2
       q1_2 = (q2 + q1 + q0 + p0 + 2) >> 2 */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    p1_16x8_2 = _mm_srai_epi16(temp1, 2);
    q1_16x8_2 = _mm_srai_epi16(temp2, 2);

    /* p0_2 = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
       q0_2 = (q2 + 2*q1 + 2*q0 + 2*p0 + p1 + 4) >> 3 */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    p0_16x8_2 = _mm_srai_epi16(temp1, 3);
    q0_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* p2_2 = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
       q2_2 = (2*q3 + 3*q2 + q1 + q0 + p0 + 4) >> 3 */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    p2_16x8_2 = _mm_srai_epi16(temp1, 3);
    q2_16x8_2 = _mm_srai_epi16(temp2, 3);

    /* Second 8 pixels: identical math on the high halves, with each
       result packed (unsigned-saturated back to 8 bit) behind the
       corresponding first-half result. */
    p3_8x16 = _mm_unpackhi_epi8(p3_16x8, zero);
    p2_8x16 = _mm_unpackhi_epi8(p2_16x8, zero);
    p1_8x16 = _mm_unpackhi_epi8(p1_16x8, zero);
    p0_8x16 = _mm_unpackhi_epi8(p0_16x8, zero);
    q0_8x16 = _mm_unpackhi_epi8(q0_16x8, zero);
    q1_8x16 = _mm_unpackhi_epi8(q1_16x8, zero);
    q2_8x16 = _mm_unpackhi_epi8(q2_16x8, zero);
    q3_8x16 = _mm_unpackhi_epi8(q3_16x8, zero);

    /* p0_1 and q0_1 */
    temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
    temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
    temp5 = _mm_add_epi16(temp1, const_val2_16x8);
    temp6 = _mm_add_epi16(temp2, const_val2_16x8);
    temp3 = _mm_slli_epi16(p1_8x16, 1);
    temp4 = _mm_slli_epi16(q1_8x16, 1);
    temp1 = _mm_add_epi16(temp5, temp3);
    temp2 = _mm_add_epi16(temp6, temp4);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, temp1);
    q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, temp2);

    /* p1_2 and q1_2 */
    temp6 = _mm_add_epi16(temp6, p0_8x16);
    temp5 = _mm_add_epi16(temp5, q0_8x16);
    temp1 = _mm_add_epi16(temp6, p2_8x16);
    temp2 = _mm_add_epi16(temp5, q2_8x16);
    temp1 = _mm_srai_epi16(temp1, 2);
    temp2 = _mm_srai_epi16(temp2, 2);
    p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, temp1);
    q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, temp2);

    /* p0_2 and q0_2 */
    temp1 = _mm_add_epi16(temp3, p2_8x16);
    temp2 = _mm_add_epi16(temp4, q2_8x16);
    temp1 = _mm_add_epi16(temp1, q1_8x16);
    temp2 = _mm_add_epi16(temp2, p1_8x16);
    temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
    temp3 = _mm_slli_epi16(temp3, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp3);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
    temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, temp1);
    q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, temp2);

    /* p2_2 and q2_2 */
    temp1 = _mm_add_epi16(temp6, const_val2_16x8);
    temp2 = _mm_add_epi16(temp5, const_val2_16x8);
    temp3 = _mm_slli_epi16(p2_8x16, 1);
    temp4 = _mm_slli_epi16(q2_8x16, 1);
    temp3 = _mm_add_epi16(p2_8x16, temp3);
    temp4 = _mm_add_epi16(q2_8x16, temp4);
    temp5 = _mm_slli_epi16(p3_8x16, 1);
    temp6 = _mm_slli_epi16(q3_8x16, 1);
    temp1 = _mm_add_epi16(temp1, temp3);
    temp2 = _mm_add_epi16(temp2, temp4);
    temp1 = _mm_add_epi16(temp1, temp5);
    temp2 = _mm_add_epi16(temp2, temp6);
    temp1 = _mm_srai_epi16(temp1, 3);
    temp2 = _mm_srai_epi16(temp2, 3);
    p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, temp1);
    q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, temp2);

    /* Masked selects, dst = (dst & ~flag) + (new & flag); the add acts
       as OR because the two masked operands occupy disjoint bytes.
       Normal-filter p0/q0 results are applied under flag1 first... */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);

    /* ...then overwritten by the strong-filter results wherever
       flag3 (p side) / flag4 (q side) also hold. */
    p0_16x8 = _mm_and_si128(p0_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
    p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
    q0_16x8 = _mm_and_si128(q0_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
    q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);

    /* p1 and q1 (only modified by the strong filter) */
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);

    /* p2 and q2 (only modified by the strong filter) */
    p2_16x8 = _mm_and_si128(p2_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
    p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
    p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
    q2_16x8 = _mm_and_si128(q2_16x8,
                            _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
    q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
    q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);

    /* Write back the six rows the bS=4 filter may modify (p3/q3 are
       never changed). */
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP2), p2_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), p1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), p0_16x8);

    _mm_storeu_si128((__m128i *)(pu1_src), q0_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), q1_16x8);
    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ2), q2_16x8);

}
817
818 /*****************************************************************************/
819 /* */
820 /* Function Name : ih264_deblk_luma_vert_bslt4_ssse3() */
821 /* */
822 /* Description : This function performs filtering of a luma block */
823 /* vertical edge when the boundary strength is less than 4. */
824 /* */
825 /* Inputs : pu1_src - pointer to the src sample q0 */
826 /* src_strd - source stride */
827 /* alpha - alpha value for the boundary */
828 /* beta - beta value for the boundary */
829 /* u4_bs - packed Boundary strength array */
830 /* pu1_cliptab - tc0_table */
831 /* */
832 /* Globals : None */
833 /* */
834 /* Processing : This operation is described in Sec. 8.7.2.3 under the */
835 /* title "Filtering process for edges for bS less than 4" */
836 /* in ITU T Rec H.264. */
837 /* */
838 /* Outputs : None */
839 /* */
840 /* Returns : None */
841 /* */
842 /* Issues : None */
843 /* */
844 /* Revision History: */
845 /* */
846 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
847 /* 12 02 2015 Naveen Kumar P Initial version */
848 /* */
849 /*****************************************************************************/
void ih264_deblk_luma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    UWORD8 u1_Bs, u1_Bs1; /* boundary strengths of the two 4-row groups in one pass */

    WORD32 j = 0;

    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i int1, int2, int3, int4, high1, high2;
    __m128i flag, flag1, i_C, i_C0;
    __m128i i_Ap, i_Aq, diff, const1, const2, in_macro, in_macrotemp, temp,
                    temp1;
    __m128i zero = _mm_setzero_si128();

    /* The 16-row vertical edge is filtered in two passes of 8 rows each */
    for(j = 0; j <= 8 * src_strd; j += 8 * src_strd)
    {
        //Transpose
        /* Load 8 rows of 8 pixels straddling the edge (pu1_src points at
           q0, so each row starts at p2), widen to 16 bits per pixel and
           transpose.  After this section each register holds one pixel
           column for all 8 rows: linea = p2, lineb = p1, linec = p0,
           lined = q0, linee = q1, linef = q2 (lineg/lineh carry the two
           trailing, unfiltered columns).  The row order inside each
           register is interleaved by the unpack sequence; the flag and
           clip vectors built below match that interleave. */
        linea = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + j));
        lineb = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + src_strd + j));
        linec = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j));
        lined = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j));

        linea = _mm_unpacklo_epi8(linea, zero);
        lineb = _mm_unpacklo_epi8(lineb, zero);
        linec = _mm_unpacklo_epi8(linec, zero);
        lined = _mm_unpacklo_epi8(lined, zero);

        int1 = _mm_unpacklo_epi16(linea, lineb);
        lineb = _mm_unpackhi_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(linec, lined);
        lined = _mm_unpackhi_epi16(linec, lined);

        linea = _mm_unpacklo_epi16(int1, int2);
        int1 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(lineb, lined);
        high1 = _mm_unpackhi_epi16(lineb, lined);

        linee = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j));
        linef = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j));
        lineg = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j));
        lineh = _mm_loadl_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j));

        linee = _mm_unpacklo_epi8(linee, zero);
        linef = _mm_unpacklo_epi8(linef, zero);
        lineg = _mm_unpacklo_epi8(lineg, zero);
        lineh = _mm_unpacklo_epi8(lineh, zero);

        int2 = _mm_unpacklo_epi16(linee, linef);
        linef = _mm_unpackhi_epi16(linee, linef);

        int3 = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = _mm_unpacklo_epi16(int2, int3);
        int2 = _mm_unpackhi_epi16(int2, int3);

        lineg = _mm_unpacklo_epi16(linef, lineh);
        high2 = _mm_unpackhi_epi16(linef, lineh);

        int4 = _mm_unpacklo_epi16(linea, linee);
        lineb = _mm_unpackhi_epi16(linea, linee);

        int3 = _mm_unpacklo_epi16(int1, int2);
        lined = _mm_unpackhi_epi16(int1, int2);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        linef = _mm_unpackhi_epi16(linec, lineg);

        linea = int4;
        linec = int3;
        linee = int2;

        lineg = _mm_unpacklo_epi16(high1, high2);
        lineh = _mm_unpackhi_epi16(high1, high2);

        //end of transpose

        /* Consume the two most-significant BS bytes for this pass; the
           remaining two bytes are shifted up for the second 8 rows. */
        u1_Bs = (u4_bs >> 24) & 0xff;
        u1_Bs1 = (u4_bs >> 16) & 0xff;
        u4_bs <<= 16;

        /* Per-row BS mask: rows whose BS is zero stay unfiltered.  The
           Bs/Bs1 lane interleave matches the transposed row order. */
        flag1 = _mm_set_epi16(u1_Bs1, u1_Bs, u1_Bs1, u1_Bs, u1_Bs1, u1_Bs,
                              u1_Bs1, u1_Bs);
        flag1 = _mm_cmpeq_epi16(flag1, zero); //Set flag to 1s and 0s
        flag1 = _mm_xor_si128(flag1, _mm_set1_epi16(0xFFFF)); //Invert for required mask

        /* tc0 per lane, looked up from the clip table by boundary strength */
        i_C0 = _mm_set_epi16(pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs],
                             pu1_cliptab[u1_Bs1], pu1_cliptab[u1_Bs]);

        diff = _mm_subs_epi16(linec, lined); //Condn 1: ABS(p0 - q0) < alpha
        diff = _mm_abs_epi16(diff);
        const1 = _mm_set1_epi16(alpha);
        flag = _mm_cmpgt_epi16(const1, diff);

        diff = _mm_subs_epi16(linee, lined); //Condtn 2: ABS(q1 - q0) < beta
        diff = _mm_abs_epi16(diff);
        const1 = _mm_set1_epi16(beta);
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff));

        diff = _mm_subs_epi16(lineb, linec); //Condtn 3: ABS(p1 - p0) < beta
        diff = _mm_abs_epi16(diff);
        flag = _mm_and_si128(flag, _mm_cmpgt_epi16(const1, diff)); //Const 1= Beta from now on

        flag = _mm_and_si128(flag, flag1); //Final flag (ui_B condition + other 3 conditions)

        //Adding Ap<Beta and Aq<Beta
        /* tC = tc0 + (Ap < beta) + (Aq < beta): each compare gives
           0/0xFFFF, and (0 - mask) turns that into 0/1. */
        i_Ap = _mm_subs_epi16(linea, linec); /* Ap = ABS(p2 - p0) */
        i_Ap = _mm_abs_epi16(i_Ap);
        const2 = _mm_cmpgt_epi16(const1, i_Ap);
        const2 = _mm_subs_epi16(zero, const2); //Make FFFF=1 and 0000=0
        i_C = _mm_add_epi16(i_C0, const2);

        i_Aq = _mm_subs_epi16(linef, lined); /* Aq = ABS(q2 - q0) */
        i_Aq = _mm_abs_epi16(i_Aq);
        const2 = _mm_cmpgt_epi16(const1, i_Aq);
        const2 = _mm_subs_epi16(zero, const2);
        i_C = _mm_add_epi16(i_C, const2);

        //Calculate in_macro
        /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
        diff = _mm_subs_epi16(lined, linec);
        diff = _mm_slli_epi16(diff, 2);
        const2 = _mm_subs_epi16(lineb, linee);
        diff = _mm_add_epi16(diff, const2);
        const2 = _mm_set1_epi16(4);
        diff = _mm_add_epi16(diff, const2);
        in_macro = _mm_srai_epi16(diff, 3);

        /* clamp delta to [-tC, tC] */
        in_macro = _mm_min_epi16(i_C, in_macro); //CLIP3
        i_C = _mm_subs_epi16(zero, i_C);
        in_macro = _mm_max_epi16(i_C, in_macro);

        //Compute and store
        /* p0' = p0 + delta on filtered lanes, original p0 elsewhere */
        in_macrotemp = _mm_add_epi16(linec, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp = _mm_and_si128(linec,
                             _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
        temp = _mm_add_epi16(temp, in_macrotemp);
        //temp= _mm_packus_epi16 (temp, zero);
        //_mm_storel_epi64(uc_HorzPixel+i16_posP0+i, in_macrotemp);

        /* q0' = q0 - delta on filtered lanes, original q0 elsewhere */
        in_macrotemp = _mm_subs_epi16(lined, in_macro);
        in_macrotemp = _mm_and_si128(in_macrotemp, flag);
        temp1 = _mm_and_si128(lined,
                              _mm_xor_si128(flag, _mm_set1_epi16(0xFFFF)));
        temp1 = _mm_add_epi16(temp1, in_macrotemp);
        //temp1= _mm_packus_epi16 (temp1, zero);
        //_mm_storel_epi64(pu1_src+i, in_macrotemp);

        //If Ap<Beta
        /* p1' = p1 + CLIP3(-tc0, tc0,
                            (p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) */
        flag1 = _mm_cmpgt_epi16(const1, i_Ap);
        flag1 = _mm_and_si128(flag, flag1);
        in_macrotemp = _mm_add_epi16(linec, lined);
        in_macrotemp = _mm_add_epi16(in_macrotemp, _mm_set1_epi16(1));
        in_macrotemp = _mm_srai_epi16(in_macrotemp, 1); /* (p0 + q0 + 1) >> 1, reused below */
        in_macro = _mm_add_epi16(in_macrotemp, linea);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(lineb, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
        i_C0 = _mm_subs_epi16(zero, i_C0); /* i_C0 now holds -tc0 */
        in_macro = _mm_max_epi16(i_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        lineb = _mm_add_epi16(lineb, in_macro);
        //in_macro= _mm_packus_epi16 (i_p1, zero);
        //_mm_storel_epi64(uc_HorzPixel+i16_posP1+i, in_macro);

        /* If Aq<Beta:
           q1' = q1 + CLIP3(-tc0, tc0,
                            (q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) */
        flag1 = _mm_cmpgt_epi16(const1, i_Aq);
        flag1 = _mm_and_si128(flag, flag1);
        in_macro = _mm_add_epi16(in_macrotemp, linef);
        in_macro = _mm_subs_epi16(in_macro, _mm_slli_epi16(linee, 1));
        in_macro = _mm_srai_epi16(in_macro, 1);

        i_C0 = _mm_abs_epi16(i_C0); /* restore +tc0 (negated in the p1 clip above) */
        in_macro = _mm_min_epi16(i_C0, in_macro); //CLIP3
        i_C0 = _mm_subs_epi16(zero, i_C0);
        in_macro = _mm_max_epi16(i_C0, in_macro);

        in_macro = _mm_and_si128(in_macro, flag1);
        linee = _mm_add_epi16(linee, in_macro);
        //in_macro= _mm_packus_epi16 (i_q1, zero);
        //_mm_storel_epi64(pu1_src+i16_posQ1+i, in_macro);
        linec = temp;  /* filtered p0 */
        lined = temp1; /* filtered q0 */
        //End of filtering

        /* Inverse transpose: rebuild the 8 pixel rows from the filtered
           columns (mirror image of the transpose above). */
        int1 = _mm_unpacklo_epi16(linea, linee);
        linee = _mm_unpackhi_epi16(linea, linee);

        int2 = _mm_unpacklo_epi16(linec, lineg);
        lineg = _mm_unpackhi_epi16(linec, lineg);

        linea = _mm_unpacklo_epi16(int1, int2);
        int3 = _mm_unpackhi_epi16(int1, int2);

        linec = _mm_unpacklo_epi16(linee, lineg);
        lineg = _mm_unpackhi_epi16(linee, lineg);

        int1 = _mm_unpacklo_epi16(lineb, linef);
        linef = _mm_unpackhi_epi16(lineb, linef);

        int2 = _mm_unpacklo_epi16(lined, lineh);
        lineh = _mm_unpackhi_epi16(lined, lineh);

        lineb = _mm_unpacklo_epi16(int1, int2);
        int4 = _mm_unpackhi_epi16(int1, int2);

        lined = _mm_unpacklo_epi16(linef, lineh);
        lineh = _mm_unpackhi_epi16(linef, lineh);

        int1 = _mm_unpackhi_epi16(linea, lineb);
        linea = _mm_unpacklo_epi16(linea, lineb);

        int2 = _mm_unpacklo_epi16(int3, int4);
        high1 = _mm_unpackhi_epi16(int3, int4);

        lineb = _mm_unpacklo_epi16(linec, lined);
        linef = _mm_unpackhi_epi16(linec, lined);

        lined = _mm_unpacklo_epi16(lineg, lineh);
        lineh = _mm_unpackhi_epi16(lineg, lineh);

        linee = int1;
        lineg = high1;
        linec = int2;
        //End of inverse transpose

        //Packs and stores
        /* Narrow each row back to 8 bits and write the 8 pixels of the
           row back over the loaded span (p2..trailing column). */
        linea = _mm_packus_epi16(linea, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + j), linea);

        lineb = _mm_packus_epi16(lineb, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + src_strd + j), lineb);

        linec = _mm_packus_epi16(linec, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 2 * src_strd + j), linec);

        lined = _mm_packus_epi16(lined, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 3 * src_strd + j), lined);

        linee = _mm_packus_epi16(linee, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 4 * src_strd + j), linee);

        linef = _mm_packus_epi16(linef, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 5 * src_strd + j), linef);

        lineg = _mm_packus_epi16(lineg, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 6 * src_strd + j), lineg);

        lineh = _mm_packus_epi16(lineh, zero);
        _mm_storel_epi64((__m128i *)(pu1_src - 3 + 7 * src_strd + j), lineh);

    }
}
1112
1113 /*****************************************************************************/
1114 /* */
1115 /* Function Name : ih264_deblk_luma_horz_bslt4_ssse3() */
1116 /* */
1117 /* Description : This function performs filtering of a luma block */
1118 /* horizontal edge when boundary strength is less than 4. */
1119 /* */
1120 /* Inputs : pu1_src - pointer to the src sample q0 */
1121 /* src_strd - source stride */
1122 /* alpha - alpha value for the boundary */
1123 /* beta - beta value for the boundary */
1124 /* u4_bs - packed Boundary strength array */
1125 /* pu1_cliptab - tc0_table */
1126 /* */
1127 /* Globals : None */
1128 /* */
1129 /* Processing : This operation is described in Sec. 8.7.2.3 under the */
1130 /* title "Filtering process for edges for bS less than 4" */
1131 /* in ITU T Rec H.264. */
1132 /* */
1133 /* Outputs : None */
1134 /* */
1135 /* Returns : None */
1136 /* */
1137 /* Issues : None */
1138 /* */
1139 /* Revision History: */
1140 /* */
1141 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1142 /* 12 02 2015 Naveen Kumar P Initial version */
1143 /* */
1144 /*****************************************************************************/
void ih264_deblk_luma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha,
                                       WORD32 beta,
                                       UWORD32 u4_bs,
                                       const UWORD8 *pu1_cliptab)
{
    WORD16 i16_posP2, i16_posP1, i16_posP0, i16_posQ1, i16_posQ2;
    UWORD8 *pu1_HorzPixel;
    __m128i zero = _mm_setzero_si128();
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C0_hi_8x16, C_8x16, C_hi_8x16;
    __m128i q0_16x8, q1_16x8, q2_16x8, p0_16x8, p1_16x8, p2_16x8;
    __m128i temp1, temp2;
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
    __m128i in_macro_16x8, in_macro_hi_16x8;
    __m128i const_val4_8x16;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    UWORD8 clip0, clip1, clip2, clip3;

    /* Base pointer for the p side: 4 rows above the edge (pu1_src is q0) */
    pu1_HorzPixel = pu1_src - (src_strd << 2);

    /* Row offsets; X2/X3 are project scaling macros (presumably 2*x and
       3*x — TODO confirm), which places p0 at pu1_src - src_strd and
       p2 at pu1_src - 3*src_strd. */
    i16_posQ1 = src_strd;
    i16_posQ2 = X2(src_strd);
    i16_posP0 = X3(src_strd);
    i16_posP1 = X2(src_strd);
    i16_posP2 = src_strd;

    q0_16x8 = _mm_loadu_si128((__m128i *)(pu1_src));
    q1_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ1));

    /* One BS byte (MSB first) per group of 4 columns; tc0 looked up per BS */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;
    clip0 = pu1_cliptab[u1_Bs0];
    clip1 = pu1_cliptab[u1_Bs1];
    clip2 = pu1_cliptab[u1_Bs2];
    clip3 = pu1_cliptab[u1_Bs3];

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Broadcast each BS byte across its 4 columns (lowest bytes first) */
    bs_flag_16x8b = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                                 u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                                 u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);

    /* Matching per-column tc0 values */
    C0_16x8 = _mm_set_epi8(clip3, clip3, clip3, clip3, clip2, clip2, clip2,
                           clip2, clip1, clip1, clip1, clip1, clip0, clip0,
                           clip0, clip0);

    /* Byte mask: 0xFF where BS != 0 (columns with BS == 0 stay unfiltered) */
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
    /* tc0 widened to 16 bits, low and high halves */
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);
    C0_hi_8x16 = _mm_unpackhi_epi8(C0_16x8, zero);

    p1_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP1));
    p0_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP0));
    p2_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixel + i16_posP2));
    q2_16x8 = _mm_loadu_si128((__m128i *)(pu1_src + i16_posQ2));

    //Cond1 (ABS(p0 - q0) < alpha)
    /* |a - b| for unsigned bytes via subs_epu8 in both directions (one
       side saturates to 0, the sum is the absolute difference). */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);

    flag1_16x8 = _mm_packs_epi16(temp2, temp1);
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);

    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    flag2_16x8 = _mm_packs_epi16(temp2, temp1);

    // flag1 = BS != 0 && (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta)
    //                 && (ABS(p1 - p0) < beta)  -- the filter-enable mask
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta), i.e. Ap < beta
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    /* flag2 = enable mask AND (Ap < beta): gates the p1 update below */
    flag2_16x8 = _mm_packs_epi16(temp2, temp1);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* (0 - mask) turns 0/0xFFFF into 0/1: tC = tc0 + (Ap < beta) ... */
    temp2 = _mm_subs_epi16(zero, temp2);
    temp1 = _mm_subs_epi16(zero, temp1);

    C_8x16 = _mm_add_epi16(C0_8x16, temp2);
    C_hi_8x16 = _mm_add_epi16(C0_hi_8x16, temp1);

    // (ABS(q2 - q0) < beta), i.e. Aq < beta
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp1 = _mm_unpackhi_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
    temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);

    /* flag3 = enable mask AND (Aq < beta): gates the q1 update below */
    flag3_16x8 = _mm_packs_epi16(temp2, temp1);
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);

    /* ... + (Aq < beta) completes tC */
    temp2 = _mm_subs_epi16(zero, temp2);
    temp1 = _mm_subs_epi16(zero, temp1);

    C_8x16 = _mm_add_epi16(C_8x16, temp2);
    C_hi_8x16 = _mm_add_epi16(C_hi_8x16, temp1);

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, low half */
    const_val4_8x16 = _mm_set1_epi16(4);
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
                           _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
                           _mm_unpacklo_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);

    /* same delta, high half */
    temp1 = _mm_subs_epi16(_mm_unpackhi_epi8(q0_16x8, zero),
                           _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p1_16x8, zero),
                           _mm_unpackhi_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_hi_16x8 = _mm_srai_epi16(temp1, 3);

    /* clamp delta to [-tC, tC] */
    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_min_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
    C_hi_8x16 = _mm_subs_epi16(zero, C_hi_8x16);
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_max_epi16(C_hi_8x16, in_macro_hi_16x8); //CLIP3

    /* p0' = p0 + delta, blended with the original p0 by the enable mask */
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p0_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag1_16x8);
    temp2 = _mm_and_si128(p0_16x8,
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    temp1 = _mm_add_epi8(temp1, temp2);

    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP0), temp1);

    /* q0' = q0 - delta, blended with the original q0 by the enable mask */
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);
    temp2 = _mm_sub_epi16(_mm_unpackhi_epi8(q0_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag1_16x8);
    temp2 = _mm_and_si128(q0_16x8,
                          _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    temp1 = _mm_add_epi8(temp1, temp2);
    _mm_storeu_si128((__m128i *)(pu1_src), temp1);

    //if(Ap < Beta)
    /* p1 delta = (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1;
       avg_epu16 computes (p0 + q0 + 1) >> 1 directly. */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
    //temp2 = _mm_subs_epi16(zero,temp2);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
                          _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(p1_16x8, zero), 1);
    //temp2 = _mm_subs_epi16(zero,temp2);
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);

    /* clamp p1 delta to [-tc0, tc0]; leaves C0 registers holding -tc0 */
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3

    /* p1' = p1 + clipped delta, blended by flag2 (Ap < beta) */
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(p1_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag2_16x8);
    temp2 = _mm_and_si128(p1_16x8,
                          _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
    temp1 = _mm_add_epi8(temp1, temp2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixel + i16_posP1), temp1);

    //if(Aq < Beta)
    /* q1 delta = (q2 + ((p0 + q0 + 1) >> 1) - 2*q1) >> 1 */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
    //temp2 = _mm_slli_epi16 (temp2, 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    temp1 = _mm_avg_epu16(_mm_unpackhi_epi8(q0_16x8, zero),
                          _mm_unpackhi_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpackhi_epi8(q1_16x8, zero), 1);
    //temp2 = _mm_slli_epi16 (temp2, 1);
    temp2 = _mm_subs_epi16(_mm_unpackhi_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_hi_16x8 = _mm_srai_epi16(temp2, 1);

    /* C0 registers still hold -tc0 from the p1 clip: clamp low first,
       re-negate to +tc0, then clamp high. */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_max_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    C0_hi_8x16 = _mm_subs_epi16(zero, C0_hi_8x16);
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    in_macro_hi_16x8 = _mm_min_epi16(C0_hi_8x16, in_macro_hi_16x8); //CLIP3

    /* q1' = q1 + clipped delta, blended by flag3 (Aq < beta) */
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);
    temp2 = _mm_add_epi16(_mm_unpackhi_epi8(q1_16x8, zero), in_macro_hi_16x8);

    temp1 = _mm_packus_epi16(temp1, temp2);

    temp1 = _mm_and_si128(temp1, flag3_16x8);
    temp2 = _mm_and_si128(q1_16x8,
                          _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
    temp1 = _mm_add_epi8(temp1, temp2);

    _mm_storeu_si128((__m128i *)(pu1_src + i16_posQ1), temp1);

}
1411
1412 /*****************************************************************************/
1413 /* */
1414 /* Function Name : ih264_deblk_luma_vert_bs4_mbaff_ssse3() */
1415 /* */
1416 /* Description : This function performs filtering of a luma block */
1417 /* vertical edge when boundary strength is set to 4. */
1418 /* */
1419 /* Inputs : pu1_src - pointer to the src sample q0 */
1420 /* src_strd - source stride */
1421 /* alpha - alpha value for the boundary */
1422 /* beta - beta value for the boundary */
1423 /* */
1424 /* Globals : None */
1425 /* */
1426 /* Processing : When the function is called twice, this operation is as */
1427 /* described in Sec. 8.7.2.3 under the title "Filtering */
1428 /* process for edges for bS equal to 4" in ITU T Rec H.264. */
1429 /* */
1430 /* Outputs : None */
1431 /* */
1432 /* Returns : None */
1433 /* */
1434 /* Issues : None */
1435 /* */
1436 /* Revision History: */
1437 /* */
1438 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1439 /* 12 02 2015 Naveen Kumar P Initial version */
1440 /* */
1441 /*****************************************************************************/
ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 * pu1_src,WORD32 src_strd,WORD32 alpha,WORD32 beta)1442 void ih264_deblk_luma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
1443 WORD32 src_strd,
1444 WORD32 alpha,
1445 WORD32 beta)
1446 {
1447 __m128i zero = _mm_setzero_si128();
1448 __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
1449 __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
1450 __m128i q0_8x16, q1_8x16, q2_8x16, q3_8x16;
1451 __m128i p0_8x16, p1_8x16, p2_8x16, p3_8x16;
1452 __m128i q0_16x8_1;
1453 __m128i p0_16x8_1;
1454 __m128i q0_16x8_2, q1_16x8_2, q2_16x8_2;
1455 __m128i p0_16x8_2, p1_16x8_2, p2_16x8_2;
1456 __m128i temp1, temp2, temp3, temp4, temp5, temp6;
1457 __m128i Alpha_8x16, Beta_8x16;
1458 __m128i flag1_16x8, flag2_16x8, flag3_16x8, flag4_16x8;
1459 __m128i const_val2_16x8 = _mm_set1_epi16(2);
1460 __m128i line1, line2, line3, line4, line5, line6, line7, line8;
1461
1462 Alpha_8x16 = _mm_set1_epi16(alpha);
1463 Beta_8x16 = _mm_set1_epi16(beta);
1464
1465 line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
1466 line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
1467 line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
1468 line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
1469 line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
1470 line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
1471 line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
1472 line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));
1473
1474 temp1 = _mm_unpacklo_epi8(line1, line2);
1475 temp2 = _mm_unpacklo_epi8(line3, line4);
1476 temp3 = _mm_unpacklo_epi8(line5, line6);
1477 temp4 = _mm_unpacklo_epi8(line7, line8);
1478
1479 line1 = _mm_unpacklo_epi16(temp1, temp2);
1480 line2 = _mm_unpackhi_epi16(temp1, temp2);
1481 line3 = _mm_unpacklo_epi16(temp3, temp4);
1482 line4 = _mm_unpackhi_epi16(temp3, temp4);
1483
1484 p1_8x16 = _mm_unpacklo_epi32(line1, line3);
1485 p0_8x16 = _mm_unpackhi_epi32(line1, line3);
1486 q0_8x16 = _mm_unpacklo_epi32(line2, line4);
1487 q1_8x16 = _mm_unpackhi_epi32(line2, line4);
1488
1489 p3_16x8 = _mm_unpacklo_epi64(p1_8x16, zero);
1490 p2_16x8 = _mm_unpackhi_epi64(p1_8x16, zero);
1491 q2_16x8 = _mm_unpacklo_epi64(q1_8x16, zero);
1492 q3_16x8 = _mm_unpackhi_epi64(q1_8x16, zero);
1493 p1_16x8 = _mm_unpacklo_epi64(p0_8x16, zero);
1494 p0_16x8 = _mm_unpackhi_epi64(p0_8x16, zero);
1495 q0_16x8 = _mm_unpacklo_epi64(q0_8x16, zero);
1496 q1_16x8 = _mm_unpackhi_epi64(q0_8x16, zero);
1497
1498 //Cond1 (ABS(p0 - q0) < alpha)
1499 temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
1500 temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
1501 temp1 = _mm_add_epi8(temp1, temp2);
1502
1503 temp2 = _mm_unpacklo_epi8(temp1, zero);
1504 temp1 = _mm_unpackhi_epi8(temp1, zero);
1505
1506 temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1507 temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1508
1509 flag1_16x8 = _mm_packs_epi16(temp2, temp1);
1510
1511 //Cond2 (ABS(q1 - q0) < beta)
1512 temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
1513 temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
1514 temp1 = _mm_add_epi8(temp1, temp2);
1515
1516 temp2 = _mm_unpacklo_epi8(temp1, zero);
1517 temp1 = _mm_unpackhi_epi8(temp1, zero);
1518
1519 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1520 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1521
1522 flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1523
1524 flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1525
1526 //Cond3 (ABS(p1 - p0) < beta)
1527 temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
1528 temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
1529 temp1 = _mm_add_epi8(temp1, temp2);
1530
1531 temp2 = _mm_unpacklo_epi8(temp1, zero);
1532 temp1 = _mm_unpackhi_epi8(temp1, zero);
1533
1534 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1535 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1536
1537 flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1538
1539 // !((ABS(p0 - q0) < alpha) || (ABS(q1 - q0) < beta) || (ABS(p1 - p0) < beta))
1540 flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1541
1542 // (ABS(p0 - q0) < ((alpha >> 2) + 2))
1543 temp1 = _mm_subs_epu8(p0_16x8, q0_16x8);
1544 temp2 = _mm_subs_epu8(q0_16x8, p0_16x8);
1545 temp1 = _mm_add_epi8(temp1, temp2);
1546 Alpha_8x16 = _mm_srai_epi16(Alpha_8x16, 2);
1547 Alpha_8x16 = _mm_add_epi16(Alpha_8x16, const_val2_16x8);
1548
1549 temp2 = _mm_unpacklo_epi8(temp1, zero);
1550 temp1 = _mm_unpackhi_epi8(temp1, zero);
1551 temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);
1552 temp1 = _mm_cmpgt_epi16(Alpha_8x16, temp1);
1553
1554 flag2_16x8 = _mm_packs_epi16(temp2, temp1);
1555 flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);
1556
1557 // (ABS(p2 - p0) < beta)
1558 temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
1559 temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
1560 temp1 = _mm_add_epi8(temp1, temp2);
1561
1562 temp2 = _mm_unpacklo_epi8(temp1, zero);
1563 temp1 = _mm_unpackhi_epi8(temp1, zero);
1564 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1565 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1566
1567 flag3_16x8 = _mm_packs_epi16(temp2, temp1);
1568 flag3_16x8 = _mm_and_si128(flag3_16x8, flag2_16x8);
1569
1570 // (ABS(q2 - q0) < beta)
1571 temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
1572 temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
1573 temp1 = _mm_add_epi8(temp1, temp2);
1574
1575 temp2 = _mm_unpacklo_epi8(temp1, zero);
1576 temp1 = _mm_unpackhi_epi8(temp1, zero);
1577 temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);
1578 temp1 = _mm_cmpgt_epi16(Beta_8x16, temp1);
1579
1580 flag4_16x8 = _mm_packs_epi16(temp2, temp1);
1581 flag4_16x8 = _mm_and_si128(flag4_16x8, flag2_16x8);
1582
1583 // First 8 pixels
1584 p3_8x16 = _mm_unpacklo_epi8(p3_16x8, zero);
1585 p2_8x16 = _mm_unpacklo_epi8(p2_16x8, zero);
1586 p1_8x16 = _mm_unpacklo_epi8(p1_16x8, zero);
1587 p0_8x16 = _mm_unpacklo_epi8(p0_16x8, zero);
1588 q0_8x16 = _mm_unpacklo_epi8(q0_16x8, zero);
1589 q1_8x16 = _mm_unpacklo_epi8(q1_16x8, zero);
1590 q2_8x16 = _mm_unpacklo_epi8(q2_16x8, zero);
1591 q3_8x16 = _mm_unpacklo_epi8(q3_16x8, zero);
1592
1593 // p0_1 and q0_1
1594 temp1 = _mm_add_epi16(p0_8x16, q1_8x16);
1595 temp2 = _mm_add_epi16(p1_8x16, q0_8x16);
1596 temp5 = _mm_add_epi16(temp1, const_val2_16x8);
1597 temp6 = _mm_add_epi16(temp2, const_val2_16x8);
1598 temp3 = _mm_slli_epi16(p1_8x16, 1);
1599 temp4 = _mm_slli_epi16(q1_8x16, 1);
1600 temp1 = _mm_add_epi16(temp5, temp3);
1601 temp2 = _mm_add_epi16(temp6, temp4);
1602 p0_16x8_1 = _mm_srai_epi16(temp1, 2);
1603 q0_16x8_1 = _mm_srai_epi16(temp2, 2);
1604
1605 // p1_2 and q1_2
1606 temp6 = _mm_add_epi16(temp6, p0_8x16);
1607 temp5 = _mm_add_epi16(temp5, q0_8x16);
1608 temp1 = _mm_add_epi16(temp6, p2_8x16);
1609 temp2 = _mm_add_epi16(temp5, q2_8x16);
1610 p1_16x8_2 = _mm_srai_epi16(temp1, 2);
1611 q1_16x8_2 = _mm_srai_epi16(temp2, 2);
1612
1613 // p0_2 and q0_2
1614 temp1 = _mm_add_epi16(temp3, p2_8x16);
1615 temp2 = _mm_add_epi16(temp4, q2_8x16);
1616 temp1 = _mm_add_epi16(temp1, q1_8x16);
1617 temp2 = _mm_add_epi16(temp2, p1_8x16);
1618 temp3 = _mm_add_epi16(p0_8x16, q0_8x16);
1619 temp3 = _mm_slli_epi16(temp3, 1);
1620 temp1 = _mm_add_epi16(temp1, temp3);
1621 temp2 = _mm_add_epi16(temp2, temp3);
1622 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(4));
1623 temp2 = _mm_add_epi16(temp2, _mm_set1_epi16(4));
1624 p0_16x8_2 = _mm_srai_epi16(temp1, 3);
1625 q0_16x8_2 = _mm_srai_epi16(temp2, 3);
1626
1627 // p2_2 and q2_2
1628 temp1 = _mm_add_epi16(temp6, const_val2_16x8);
1629 temp2 = _mm_add_epi16(temp5, const_val2_16x8);
1630 temp3 = _mm_slli_epi16(p2_8x16, 1);
1631 temp4 = _mm_slli_epi16(q2_8x16, 1);
1632 temp3 = _mm_add_epi16(p2_8x16, temp3);
1633 temp4 = _mm_add_epi16(q2_8x16, temp4);
1634 temp5 = _mm_slli_epi16(p3_8x16, 1);
1635 temp6 = _mm_slli_epi16(q3_8x16, 1);
1636 temp1 = _mm_add_epi16(temp1, temp3);
1637 temp2 = _mm_add_epi16(temp2, temp4);
1638 temp1 = _mm_add_epi16(temp1, temp5);
1639 temp2 = _mm_add_epi16(temp2, temp6);
1640 p2_16x8_2 = _mm_srai_epi16(temp1, 3);
1641 q2_16x8_2 = _mm_srai_epi16(temp2, 3);
1642
1643 // p0_1 and q0_1
1644 p0_16x8_1 = _mm_packus_epi16(p0_16x8_1, zero);
1645 q0_16x8_1 = _mm_packus_epi16(q0_16x8_1, zero);
1646
1647 // p1_2 and q1_2
1648 p1_16x8_2 = _mm_packus_epi16(p1_16x8_2, zero);
1649 q1_16x8_2 = _mm_packus_epi16(q1_16x8_2, zero);
1650
1651 // p0_2 and q0_2
1652 p0_16x8_2 = _mm_packus_epi16(p0_16x8_2, zero);
1653 q0_16x8_2 = _mm_packus_epi16(q0_16x8_2, zero);
1654
1655 // p2_2 and q2_2
1656 p2_16x8_2 = _mm_packus_epi16(p2_16x8_2, zero);
1657 q2_16x8_2 = _mm_packus_epi16(q2_16x8_2, zero);
1658
1659 // p0 and q0
1660 p0_16x8 = _mm_and_si128(p0_16x8,
1661 _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1662 p0_16x8_1 = _mm_and_si128(p0_16x8_1, flag1_16x8);
1663 p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_1);
1664 q0_16x8 = _mm_and_si128(q0_16x8,
1665 _mm_xor_si128(flag1_16x8, _mm_set1_epi8(0xFF)));
1666 q0_16x8_1 = _mm_and_si128(q0_16x8_1, flag1_16x8);
1667 q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_1);
1668
1669 // p0 and q0
1670 p0_16x8 = _mm_and_si128(p0_16x8,
1671 _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1672 p0_16x8_2 = _mm_and_si128(p0_16x8_2, flag3_16x8);
1673 p0_16x8 = _mm_add_epi8(p0_16x8, p0_16x8_2);
1674 q0_16x8 = _mm_and_si128(q0_16x8,
1675 _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1676 q0_16x8_2 = _mm_and_si128(q0_16x8_2, flag4_16x8);
1677 q0_16x8 = _mm_add_epi8(q0_16x8, q0_16x8_2);
1678
1679 // p1 and q1
1680 p1_16x8 = _mm_and_si128(p1_16x8,
1681 _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1682 p1_16x8_2 = _mm_and_si128(p1_16x8_2, flag3_16x8);
1683 p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_2);
1684 q1_16x8 = _mm_and_si128(q1_16x8,
1685 _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1686 q1_16x8_2 = _mm_and_si128(q1_16x8_2, flag4_16x8);
1687 q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_2);
1688
1689 // p2 and q2
1690 p2_16x8 = _mm_and_si128(p2_16x8,
1691 _mm_xor_si128(flag3_16x8, _mm_set1_epi8(0xFF)));
1692 p2_16x8_2 = _mm_and_si128(p2_16x8_2, flag3_16x8);
1693 p2_16x8 = _mm_add_epi8(p2_16x8, p2_16x8_2);
1694 q2_16x8 = _mm_and_si128(q2_16x8,
1695 _mm_xor_si128(flag4_16x8, _mm_set1_epi8(0xFF)));
1696 q2_16x8_2 = _mm_and_si128(q2_16x8_2, flag4_16x8);
1697 q2_16x8 = _mm_add_epi8(q2_16x8, q2_16x8_2);
1698
1699 temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
1700 temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8);
1701 temp3 = _mm_unpacklo_epi8(q0_16x8, q1_16x8);
1702 temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);
1703
1704 p3_8x16 = _mm_unpacklo_epi16(temp1, temp2);
1705 p2_8x16 = _mm_unpackhi_epi16(temp1, temp2);
1706 q2_8x16 = _mm_unpacklo_epi16(temp3, temp4);
1707 q3_8x16 = _mm_unpackhi_epi16(temp3, temp4);
1708
1709 line1 = _mm_unpacklo_epi32(p3_8x16, q2_8x16);
1710 line2 = _mm_srli_si128(line1, 8);
1711 line3 = _mm_unpackhi_epi32(p3_8x16, q2_8x16);
1712 line4 = _mm_srli_si128(line3, 8);
1713 line5 = _mm_unpacklo_epi32(p2_8x16, q3_8x16);
1714 line6 = _mm_srli_si128(line5, 8);
1715 line7 = _mm_unpackhi_epi32(p2_8x16, q3_8x16);
1716 line8 = _mm_srli_si128(line7, 8);
1717
1718 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
1719 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
1720 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
1721 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
1722 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
1723 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
1724 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
1725 _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
1726
1727 }
1728
1729 /*****************************************************************************/
1730 /* */
1731 /* Function Name : ih264_deblk_luma_vert_bslt4_mbaff_ssse3() */
1732 /* */
1733 /* Description : This function performs filtering of a luma block */
1734 /* vertical edge when boundary strength is less than 4. */
1735 /* */
1736 /* Inputs : pu1_src - pointer to the src sample q0 */
1737 /* src_strd - source stride */
1738 /* alpha - alpha value for the boundary */
1739 /* beta - beta value for the boundary */
1740 /* u4_bs - packed Boundary strength array */
1741 /* pu1_cliptab - tc0_table */
1742 /* */
1743 /* Globals : None */
1744 /* */
1745 /* Processing : When the function is called twice, this operation is as */
1746 /* described in Sec. 8.7.2.3 under the title "Filtering */
1747 /* process for edges for bS less than 4" in ITU T Rec H.264.*/
1748 /* */
1749 /* Outputs : None */
1750 /* */
1751 /* Returns : None */
1752 /* */
1753 /* Issues : None */
1754 /* */
1755 /* Revision History: */
1756 /* */
1757 /* DD MM YYYY Author(s) Changes (Describe the changes made) */
1758 /* 12 02 2015 Naveen Kumar P Initial version */
1759 /* */
1760 /*****************************************************************************/
void ih264_deblk_luma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                             WORD32 src_strd,
                                             WORD32 alpha,
                                             WORD32 beta,
                                             UWORD32 u4_bs,
                                             const UWORD8 *pu1_cliptab)
{
    __m128i zero = _mm_setzero_si128();
    __m128i bs_flag_16x8b, C0_16x8, C0_8x16, C_8x16;
    __m128i q0_16x8, q1_16x8, q2_16x8, q3_16x8;
    __m128i p0_16x8, p1_16x8, p2_16x8, p3_16x8;
    __m128i temp1, temp2, temp3, temp4;
    __m128i Alpha_8x16, Beta_8x16, flag1_16x8, flag2_16x8, flag3_16x8;
    __m128i in_macro_16x8;
    __m128i const_val4_8x16;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    UWORD8 clip0, clip1, clip2, clip3;
    __m128i line1, line2, line3, line4, line5, line6, line7, line8;
    __m128i q0_16x8_1, q1_16x8_1, q0_16x8_2;
    __m128i p0_16x8_1, p1_16x8_1, p0_16x8_2;

    /* Load 8 rows of 8 bytes each, spanning the 4 samples on either side
       of the vertical edge (pu1_src points at q0, so start at -4). */
    line1 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd));
    line2 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd));
    line3 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd));
    line4 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd));
    line5 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd));
    line6 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd));
    line7 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd));
    line8 = _mm_loadl_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd));

    /* 8x8 byte transpose: after this, each pX_16x8/qX_16x8 register holds
       the 8 samples of one column (one pixel position across the 8 rows)
       in its low 64 bits, with the high 64 bits zeroed. */
    temp1 = _mm_unpacklo_epi8(line1, line2);
    temp2 = _mm_unpacklo_epi8(line3, line4);
    temp3 = _mm_unpacklo_epi8(line5, line6);
    temp4 = _mm_unpacklo_epi8(line7, line8);

    line1 = _mm_unpacklo_epi16(temp1, temp2);
    line2 = _mm_unpackhi_epi16(temp1, temp2);
    line3 = _mm_unpacklo_epi16(temp3, temp4);
    line4 = _mm_unpackhi_epi16(temp3, temp4);

    temp1 = _mm_unpacklo_epi32(line1, line3);
    temp2 = _mm_unpackhi_epi32(line1, line3);
    temp3 = _mm_unpacklo_epi32(line2, line4);
    temp4 = _mm_unpackhi_epi32(line2, line4);

    p3_16x8 = _mm_unpacklo_epi64(temp1, zero);
    p2_16x8 = _mm_unpackhi_epi64(temp1, zero);
    q2_16x8 = _mm_unpacklo_epi64(temp4, zero);
    q3_16x8 = _mm_unpackhi_epi64(temp4, zero);
    p1_16x8 = _mm_unpacklo_epi64(temp2, zero);
    p0_16x8 = _mm_unpackhi_epi64(temp2, zero);
    q0_16x8 = _mm_unpacklo_epi64(temp3, zero);
    q1_16x8 = _mm_unpackhi_epi64(temp3, zero);

    /* Unpack the four boundary-strength values (MSB first) and look up the
       corresponding tc0 clipping thresholds; each bS value governs 2 of the
       8 rows of this MBAFF edge. */
    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;
    clip0 = pu1_cliptab[u1_Bs0];
    clip1 = pu1_cliptab[u1_Bs1];
    clip2 = pu1_cliptab[u1_Bs2];
    clip3 = pu1_cliptab[u1_Bs3];

    Alpha_8x16 = _mm_set1_epi16(alpha);
    Beta_8x16 = _mm_set1_epi16(beta);

    /* Replicate each bS and clip value into 2 adjacent byte lanes so that
       the masks line up with the 8 transposed rows. */
    bs_flag_16x8b = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
                                 u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);

    C0_16x8 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, clip3, clip3, clip2, clip2,
                           clip1, clip1, clip0, clip0);

    /* Lanes with bS == 0 must not be filtered at all. */
    bs_flag_16x8b = _mm_cmpeq_epi8(bs_flag_16x8b, zero);
    bs_flag_16x8b = _mm_xor_si128(bs_flag_16x8b, _mm_set1_epi8(0xFF)); //Invert for required mask
    C0_8x16 = _mm_unpacklo_epi8(C0_16x8, zero);

    //Cond1 (ABS(p0 - q0) < alpha)
    /* |a - b| for unsigned bytes via the saturating-subtract-both-ways trick:
       at least one of the two differences is 0, so their sum is |a - b|. */
    temp1 = _mm_subs_epu8(q0_16x8, p0_16x8);
    temp2 = _mm_subs_epu8(p0_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Alpha_8x16, temp2);

    flag1_16x8 = _mm_packs_epi16(temp2, zero);
    flag1_16x8 = _mm_and_si128(flag1_16x8, bs_flag_16x8b);

    //Cond2 (ABS(q1 - q0) < beta)
    temp1 = _mm_subs_epu8(q0_16x8, q1_16x8);
    temp2 = _mm_subs_epu8(q1_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    //Cond3 (ABS(p1 - p0) < beta)
    temp1 = _mm_subs_epu8(p0_16x8, p1_16x8);
    temp2 = _mm_subs_epu8(p1_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);

    // flag1 = bS != 0 && (ABS(p0 - q0) < alpha) && (ABS(q1 - q0) < beta)
    //                 && (ABS(p1 - p0) < beta)  -- the per-lane filter enable
    flag1_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    // (ABS(p2 - p0) < beta)  -- Ap condition; enables p1 filtering and bumps C
    temp1 = _mm_subs_epu8(p0_16x8, p2_16x8);
    temp2 = _mm_subs_epu8(p2_16x8, p0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag2_16x8 = _mm_packs_epi16(temp2, zero);
    flag2_16x8 = _mm_and_si128(flag1_16x8, flag2_16x8);

    /* cmpgt gives -1 per true lane; negate to +1 so C = C0 + (Ap < beta). */
    temp2 = _mm_subs_epi16(zero, temp2);

    C_8x16 = _mm_add_epi16(C0_8x16, temp2);

    // (ABS(q2 - q0) < beta)  -- Aq condition; enables q1 filtering and bumps C
    temp1 = _mm_subs_epu8(q0_16x8, q2_16x8);
    temp2 = _mm_subs_epu8(q2_16x8, q0_16x8);
    temp1 = _mm_add_epi8(temp1, temp2);

    temp2 = _mm_unpacklo_epi8(temp1, zero);
    temp2 = _mm_cmpgt_epi16(Beta_8x16, temp2);

    flag3_16x8 = _mm_packs_epi16(temp2, zero);
    flag3_16x8 = _mm_and_si128(flag1_16x8, flag3_16x8);

    temp2 = _mm_subs_epi16(zero, temp2);

    /* C = C0 + (Ap < beta) + (Aq < beta), per Sec. 8.7.2.3. */
    C_8x16 = _mm_add_epi16(C_8x16, temp2);

    /* delta = ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3, in 16-bit lanes. */
    const_val4_8x16 = _mm_set1_epi16(4);
    temp1 = _mm_subs_epi16(_mm_unpacklo_epi8(q0_16x8, zero),
                           _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p1_16x8, zero),
                           _mm_unpacklo_epi8(q1_16x8, zero));
    temp1 = _mm_slli_epi16(temp1, 2);
    temp1 = _mm_add_epi16(temp1, temp2);
    temp1 = _mm_add_epi16(temp1, const_val4_8x16);
    in_macro_16x8 = _mm_srai_epi16(temp1, 3);

    /* CLIP3(-C, C, delta) via min with +C then max with -C. */
    in_macro_16x8 = _mm_min_epi16(C_8x16, in_macro_16x8); //CLIP3
    C_8x16 = _mm_subs_epi16(zero, C_8x16);
    in_macro_16x8 = _mm_max_epi16(C_8x16, in_macro_16x8); //CLIP3

    // p0' = clip(p0 + delta); blended with original p0 under flag1
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p0_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    p0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
    p0_16x8_2 = _mm_and_si128(
        p0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    p0_16x8_1 = _mm_add_epi8(p0_16x8_1, p0_16x8_2);

    // q0' = clip(q0 - delta); blended with original q0 under flag1
    temp1 = _mm_sub_epi16(_mm_unpacklo_epi8(q0_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    q0_16x8_1 = _mm_and_si128(temp1, flag1_16x8);
    q0_16x8_2 = _mm_and_si128(
        q0_16x8, _mm_xor_si128(flag1_16x8, _mm_set1_epi16(0xFFFF)));

    q0_16x8_1 = _mm_add_epi8(q0_16x8_1, q0_16x8_2);

    //if(Ap < Beta)
    /* p1' = p1 + clip(-C0, C0, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1);
       avg_epu16 supplies the rounded (p0 + q0 + 1) >> 1 term. */
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(p1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(p2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3

    // p1
    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(p1_16x8, zero), in_macro_16x8);

    temp1 = _mm_packus_epi16(temp1, zero);

    p1_16x8_1 = _mm_and_si128(temp1, flag2_16x8);
    p1_16x8 = _mm_and_si128(p1_16x8,
                            _mm_xor_si128(flag2_16x8, _mm_set1_epi16(0xFFFF)));
    p1_16x8 = _mm_add_epi8(p1_16x8, p1_16x8_1);

    //if(Aq < Beta)
    temp1 = _mm_avg_epu16(_mm_unpacklo_epi8(q0_16x8, zero),
                          _mm_unpacklo_epi8(p0_16x8, zero));
    temp2 = _mm_slli_epi16(_mm_unpacklo_epi8(q1_16x8, zero), 1);
    temp2 = _mm_subs_epi16(_mm_unpacklo_epi8(q2_16x8, zero), temp2);
    temp2 = _mm_add_epi16(temp1, temp2);
    in_macro_16x8 = _mm_srai_epi16(temp2, 1);

    /* NOTE: C0_8x16 still holds -C0 from the p1 clip above, so the clip
       here is max with -C0 first, then re-negate back to +C0 and min. */
    in_macro_16x8 = _mm_max_epi16(C0_8x16, in_macro_16x8); //CLIP3
    C0_8x16 = _mm_subs_epi16(zero, C0_8x16);
    in_macro_16x8 = _mm_min_epi16(C0_8x16, in_macro_16x8); //CLIP3

    temp1 = _mm_add_epi16(_mm_unpacklo_epi8(q1_16x8, zero), in_macro_16x8);

    // q1
    temp1 = _mm_packus_epi16(temp1, zero);

    q1_16x8_1 = _mm_and_si128(temp1, flag3_16x8);
    q1_16x8 = _mm_and_si128(q1_16x8,
                            _mm_xor_si128(flag3_16x8, _mm_set1_epi16(0xFFFF)));
    q1_16x8 = _mm_add_epi8(q1_16x8, q1_16x8_1);

    /* Transpose the (possibly filtered) columns back into 8 rows of 8 bytes;
       p3/q3 pass through unmodified, p0/q0 come from the blended *_1 copies. */
    temp1 = _mm_unpacklo_epi8(p3_16x8, p2_16x8);
    temp2 = _mm_unpacklo_epi8(p1_16x8, p0_16x8_1);
    temp3 = _mm_unpacklo_epi8(q0_16x8_1, q1_16x8);
    temp4 = _mm_unpacklo_epi8(q2_16x8, q3_16x8);

    line7 = _mm_unpacklo_epi16(temp1, temp2);
    temp1 = _mm_unpackhi_epi16(temp1, temp2);
    line8 = _mm_unpacklo_epi16(temp3, temp4);
    temp2 = _mm_unpackhi_epi16(temp3, temp4);

    line1 = _mm_unpacklo_epi32(line7, line8);
    line2 = _mm_srli_si128(line1, 8);
    line3 = _mm_unpackhi_epi32(line7, line8);
    line4 = _mm_srli_si128(line3, 8);
    line5 = _mm_unpacklo_epi32(temp1, temp2);
    line6 = _mm_srli_si128(line5, 8);
    line7 = _mm_unpackhi_epi32(temp1, temp2);
    line8 = _mm_srli_si128(line7, 8);

    /* Write the 8 reconstructed rows back in place. */
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 0 * src_strd), line1);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 1 * src_strd), line2);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 2 * src_strd), line3);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 3 * src_strd), line4);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 4 * src_strd), line5);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 5 * src_strd), line6);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 6 * src_strd), line7);
    _mm_storel_epi64((__m128i *)(pu1_src - 4 + 7 * src_strd), line8);
}
2012
2013