/******************************************************************************
*
* Copyright (C) 2015 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/*****************************************************************************/
/* */
/* File Name : ih264_deblk_chroma_ssse3.c */
/* */
/* Description : Contains function definitions for deblocking */
/* */
/* List of Functions : ih264_deblk_chroma_vert_bs4_ssse3() */
/* ih264_deblk_chroma_horz_bs4_ssse3() */
/* ih264_deblk_chroma_vert_bslt4_ssse3() */
/* ih264_deblk_chroma_horz_bslt4_ssse3() */
/* ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */
/* ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */
/* */
/* Issues / Problems : None */
/* */
/* Revision History : */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Added chroma deblocking ssse3 */
/* intrinsics */
/* */
/*****************************************************************************/

/*****************************************************************************/
/* File Includes */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <immintrin.h> /* SSE/SSSE3 intrinsics (__m128i, _mm_* functions) */

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_platform_macros.h"
#include "ih264_deblk_edge_filters.h"
#include "ih264_macros.h"

/*****************************************************************************/
/* Function Definitions */
/*****************************************************************************/

/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_vert_bs4_ssse3() */
/* */
/* Description : This function performs filtering of a chroma block */
/* vertical edge when the boundary strength is set to 4 in */
/* high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.4 under the */
/* title "Filtering process for edges for bS equal to 4" in */
/* ITU-T Rec. H.264, with alpha and beta values different in */
/* U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha_cb,
                                       WORD32 beta_cb,
                                       WORD32 alpha_cr,
                                       WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i temp1, temp2, temp3, temp4;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1, flag2;
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Load and transpose the pixel values */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);
    temp3 = _mm_unpacklo_epi16(linee, linef);
    temp4 = _mm_unpacklo_epi16(lineg, lineh);

    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    /* End of transpose */
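    /*
     * Each 8-byte load above holds the four interleaved UV pairs
     * p1, p0, q0, q1 of one row. Treating each UV pair as a 16-bit unit,
     * the unpack cascade transposes the 8x4 block so that p1_uv_16x8 ..
     * q1_uv_16x8 each collect one column of the edge: the U and V samples
     * of that column for all 8 rows (16 bytes per register).
     */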
139
140 q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
141 q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
142 p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
143 p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);
144
145 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
146 diff = _mm_abs_epi16(diff);
147 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
148 flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
149
150 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
151 diff = _mm_abs_epi16(diff);
152 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
153 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
154
155 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
156 diff = _mm_abs_epi16(diff);
157 flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
158
159 temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
160 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
161 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
162 temp1 = _mm_add_epi16(temp1, temp2);
163 p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
164
165 temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
166 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
167 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
168 temp1 = _mm_add_epi16(temp1, temp2);
169 q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);
170
171 q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
172 q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
173 p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
174 p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);
175
176 diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); //Condn 1
177 diff = _mm_abs_epi16(diff);
178 alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
179 flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);
180
181 diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); //Condtn 2
182 diff = _mm_abs_epi16(diff);
183 beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
184 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
185
186 diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); //Condtn 3
187 diff = _mm_abs_epi16(diff);
188 flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));
189
190 temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
191 temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
192 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
193 temp1 = _mm_add_epi16(temp1, temp2);
194 p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
195
196 temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
197 temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
198 temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
199 temp1 = _mm_add_epi16(temp1, temp2);
200 q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);
201
202 p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
203 q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);
204
205 flag1 = _mm_packs_epi16(flag1, flag2);
206
207 p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
208 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
209 p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
210 p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
211
212 q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
213 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
214 q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
215 q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
216
217 /* Inverse-transpose and store back */
218 temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
219 temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
220 temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
221 temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);
222
223 linea = _mm_unpacklo_epi32(temp1, temp3);
224 lineb = _mm_srli_si128(linea, 8);
225 linec = _mm_unpackhi_epi32(temp1, temp3);
226 lined = _mm_srli_si128(linec, 8);
227 linee = _mm_unpacklo_epi32(temp2, temp4);
228 linef = _mm_srli_si128(linee, 8);
229 lineg = _mm_unpackhi_epi32(temp2, temp4);
230 lineh = _mm_srli_si128(lineg, 8);
231
232 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
233 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
234 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
235 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
236 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
237 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
238 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
239 _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);
240
241 }
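
/*
 * Reference only: a scalar sketch (not part of the original implementation)
 * of the bS = 4 chroma filter that the SIMD code above applies to 16 byte
 * lanes (8 rows x U,V) at once, per Sec. 8.7.2.4 of ITU-T Rec. H.264.
 * Function and variable names are illustrative; UWORD8/WORD32 come from
 * ih264_typedefs.h.
 */
static void ih264_deblk_chroma_bs4_sample_sketch(UWORD8 *pu1_p1,
                                                 UWORD8 *pu1_p0,
                                                 UWORD8 *pu1_q0,
                                                 UWORD8 *pu1_q1,
                                                 WORD32 alpha,
                                                 WORD32 beta)
{
    WORD32 p1 = *pu1_p1, p0 = *pu1_p0, q0 = *pu1_q0, q1 = *pu1_q1;
    WORD32 d_pq = (p0 > q0) ? (p0 - q0) : (q0 - p0);
    WORD32 d_q = (q1 > q0) ? (q1 - q0) : (q0 - q1);
    WORD32 d_p = (p1 > p0) ? (p1 - p0) : (p0 - p1);

    /* Conditions 1-3: |p0 - q0| < alpha, |q1 - q0| < beta, |p1 - p0| < beta */
    if((d_pq < alpha) && (d_q < beta) && (d_p < beta))
    {
        /* Results stay in [0, 255] for 8-bit inputs, so no clip is needed */
        *pu1_p0 = (UWORD8)((2 * p1 + p0 + q1 + 2) >> 2);
        *pu1_q0 = (UWORD8)((2 * q1 + q0 + p1 + 2) >> 2);
    }
}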

/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_horz_bs4_ssse3() */
/* */
/* Description : This function performs filtering of a chroma block */
/* horizontal edge when the boundary strength is set to 4 */
/* in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.4 under the */
/* title "Filtering process for edges for bS equal to 4" in */
/* ITU-T Rec. H.264, with alpha and beta values different in */
/* U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bs4_ssse3(UWORD8 *pu1_src,
                                       WORD32 src_strd,
                                       WORD32 alpha_cb,
                                       WORD32 beta_cb,
                                       WORD32 alpha_cr,
                                       WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U */
    WORD16 i16_posP1, i16_posP0, i16_posQ1;

    UWORD8 *pu1_HorzPixelUV; /*!< Pointer to the first pixel of the boundary */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1, flag2;
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;
    __m128i temp1, temp2;

    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

    i16_posQ1 = src_strd;
    i16_posP0 = src_strd;
    i16_posP1 = 0;

    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* Repeat the conditions and the filter for the high 8 bytes */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_2 = _mm_srai_epi16(temp1, 2);

    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);

    /* Select filtered bytes where flag1 is set, original bytes elsewhere */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}
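
/*
 * All the functions in this file apply their results through the same
 * branchless select: with an all-ones/all-zeroes byte mask 'flag',
 *     out = (orig & ~flag) | (filtered & flag)
 * where the '|' is realized as _mm_add_epi8, which is equivalent here
 * because the two operands never have overlapping nonzero bytes. A scalar
 * sketch of the idiom (not part of the original implementation; names
 * illustrative):
 */
static UWORD8 ih264_mask_select_sketch(UWORD8 u1_orig, UWORD8 u1_filt,
                                       UWORD8 u1_flag)
{
    /* u1_flag is 0xFF to take the filtered value, 0x00 to keep the original */
    return (UWORD8)((u1_orig & (UWORD8)~u1_flag) | (u1_filt & u1_flag));
}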

/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_vert_bslt4_ssse3() */
/* */
/* Description : This function performs filtering of a chroma block */
/* vertical edge when the boundary strength is less than 4 */
/* in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* u4_bs - packed boundary strength array */
/* pu1_cliptab_cb - tc0_table for U */
/* pu1_cliptab_cr - tc0_table for V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU-T Rec. H.264, with alpha and beta values different */
/* in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_ssse3(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U */
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined, linee, linef, lineg, lineh;
    __m128i temp1, temp2, temp3, temp4;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1, flag2;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16;
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); // 0xFF where Bs == 0
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); // Invert for required mask
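    /*
     * flag_bs is now a byte mask: 0xFF where the boundary strength is
     * nonzero (filtering allowed), 0x00 where Bs == 0 (no filtering).
     * Each Bs value spans 4 bytes, i.e. the U and V samples of two
     * consecutive rows of the transposed edge.
     */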

    /* Load and transpose the pixel values */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));
    linee = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd));
    linef = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd));
    lineg = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd));
    lineh = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);
    temp3 = _mm_unpacklo_epi16(linee, linef);
    temp4 = _mm_unpacklo_epi16(lineg, lineh);

    p1_uv_8x16 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_8x16 = _mm_unpacklo_epi32(temp3, temp4);
    q0_uv_8x16 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_8x16 = _mm_unpackhi_epi32(temp3, temp4);

    p1_uv_16x8 = _mm_unpacklo_epi64(p1_uv_8x16, p0_uv_8x16);
    p0_uv_16x8 = _mm_unpackhi_epi64(p1_uv_8x16, p0_uv_8x16);
    q0_uv_16x8 = _mm_unpacklo_epi64(q0_uv_8x16, q1_uv_8x16);
    q1_uv_16x8 = _mm_unpackhi_epi64(q0_uv_8x16, q1_uv_8x16);
    /* End of transpose */

    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* C0 = tc0 per row, interleaved as (Cb, Cr); Bs0/Bs1 cover rows 0-3 */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); /* tc = tc0 + 1 for chroma */

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); // CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Repeat the conditions and the filter for the high 8 bytes (rows 4-7) */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* C0 = tc0 per row, interleaved as (Cb, Cr); Bs2/Bs3 cover rows 4-7 */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); /* tc = tc0 + 1 for chroma */

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); // CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);
    flag1 = _mm_and_si128(flag1, flag_bs); // Final flag (Bs condition + other 3 conditions)

    /* Select filtered bytes where flag1 is set, original bytes elsewhere */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpackhi_epi16(p1_uv_16x8, p0_uv_16x8);
    temp3 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);
    temp4 = _mm_unpackhi_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp3);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp3);
    lined = _mm_srli_si128(linec, 8);
    linee = _mm_unpacklo_epi32(temp2, temp4);
    linef = _mm_srli_si128(linee, 8);
    lineg = _mm_unpackhi_epi32(temp2, temp4);
    lineh = _mm_srli_si128(lineg, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 4 * src_strd), linee);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 5 * src_strd), linef);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 6 * src_strd), lineg);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 7 * src_strd), lineh);

}
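
/*
 * Reference only: a scalar sketch (not part of the original implementation)
 * of the bS < 4 chroma filter realized above, per Sec. 8.7.2.3 of ITU-T
 * Rec. H.264, assuming the caller already checked Bs != 0. For chroma the
 * clip bound is tc = tc0 + 1, with tc0 taken from the clip table indexed
 * by Bs. Names are illustrative.
 */
static void ih264_deblk_chroma_bslt4_sample_sketch(UWORD8 *pu1_p1,
                                                   UWORD8 *pu1_p0,
                                                   UWORD8 *pu1_q0,
                                                   UWORD8 *pu1_q1,
                                                   WORD32 alpha,
                                                   WORD32 beta,
                                                   WORD32 tc0)
{
    WORD32 p1 = *pu1_p1, p0 = *pu1_p0, q0 = *pu1_q0, q1 = *pu1_q1;
    WORD32 d_pq = (p0 > q0) ? (p0 - q0) : (q0 - p0);
    WORD32 d_q = (q1 > q0) ? (q1 - q0) : (q0 - q1);
    WORD32 d_p = (p1 > p0) ? (p1 - p0) : (p0 - p1);

    if((d_pq < alpha) && (d_q < beta) && (d_p < beta))
    {
        WORD32 tc = tc0 + 1;
        WORD32 delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3;

        /* CLIP3(-tc, tc, delta) */
        delta = (delta < -tc) ? -tc : ((delta > tc) ? tc : delta);

        /* CLIP1: saturate to [0, 255]; the SIMD path gets this for free
           from the unsigned pack (_mm_packus_epi16) */
        p0 += delta;
        q0 -= delta;
        *pu1_p0 = (UWORD8)((p0 < 0) ? 0 : ((p0 > 255) ? 255 : p0));
        *pu1_q0 = (UWORD8)((q0 < 0) ? 0 : ((q0 > 255) ? 255 : q0));
    }
}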

/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_horz_bslt4_ssse3() */
/* */
/* Description : This function performs filtering of a chroma block */
/* horizontal edge when the boundary strength is less than */
/* 4 in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* u4_bs - packed boundary strength array */
/* pu1_cliptab_cb - tc0_table for U */
/* pu1_cliptab_cr - tc0_table for V */
/* */
/* Globals : None */
/* */
/* Processing : This operation is described in Sec. 8.7.2.3 under the */
/* title "Filtering process for edges for bS less than 4" */
/* in ITU-T Rec. H.264, with alpha and beta values different */
/* in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* */
/*****************************************************************************/
void ih264_deblk_chroma_horz_bslt4_ssse3(UWORD8 *pu1_src,
                                         WORD32 src_strd,
                                         WORD32 alpha_cb,
                                         WORD32 beta_cb,
                                         WORD32 alpha_cr,
                                         WORD32 beta_cr,
                                         UWORD32 u4_bs,
                                         const UWORD8 *pu1_cliptab_cb,
                                         const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U */
    WORD16 i16_posP1, i16_posP0, i16_posQ1;
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;

    UWORD8 *pu1_HorzPixelUV; /*!< Pointer to the first pixel of the boundary */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1, flag2;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16;
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    pu1_HorzPixelUV = pu1_src_uv - (src_strd << 1);

    i16_posQ1 = src_strd;
    i16_posP0 = src_strd;
    i16_posP1 = 0;

    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Byte mask: 0xFF where Bs != 0 (filter), 0x00 where Bs == 0 (skip);
       each Bs value covers 4 bytes, i.e. two adjacent UV pairs of the edge */
    flag_bs = _mm_set_epi8(u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs3, u1_Bs2, u1_Bs2,
                           u1_Bs2, u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs1, u1_Bs1,
                           u1_Bs0, u1_Bs0, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); // 0xFF where Bs == 0
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); // Invert for required mask

    q0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv));
    q1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_src_uv + i16_posQ1));
    p1_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP1));
    p0_uv_16x8 = _mm_loadu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0));

    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* C0 = tc0 interleaved as (Cb, Cr); Bs0/Bs1 cover the first four UV pairs */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); /* tc = tc0 + 1 for chroma */

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); // CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    /* Repeat the conditions and the filter for the high 8 bytes */
    q0_uv_8x16 = _mm_unpackhi_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpackhi_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpackhi_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpackhi_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag2 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag2 = _mm_and_si128(flag2, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* C0 = tc0 interleaved as (Cb, Cr); Bs2/Bs3 cover the last four UV pairs */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2]);

    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); /* tc = tc0 + 1 for chroma */

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); // CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_2 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_2);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_2);

    flag1 = _mm_packs_epi16(flag1, flag2);
    flag1 = _mm_and_si128(flag1, flag_bs); // Final flag (Bs condition + other 3 conditions)

    /* Select filtered bytes where flag1 is set, original bytes elsewhere */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_8x16_1 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_HorzPixelUV + i16_posP0), p0_uv_8x16_1);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_8x16_1 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);
    _mm_storeu_si128((__m128i *)(pu1_src_uv), q0_uv_8x16_1);

}

/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_vert_bs4_mbaff_ssse3() */
/* */
/* Description : This function performs filtering of a chroma block */
/* vertical edge when the boundary strength is set to 4 in */
/* high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* */
/* Globals : None */
/* */
/* Processing : When the function is called twice, this operation is as */
/* described in Sec. 8.7.2.4 under the title "Filtering */
/* process for edges for bS equal to 4" in ITU-T Rec. H.264, */
/* with alpha and beta values different in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bs4_mbaff_ssse3(UWORD8 *pu1_src,
                                             WORD32 src_strd,
                                             WORD32 alpha_cb,
                                             WORD32 beta_cb,
                                             WORD32 alpha_cr,
                                             WORD32 beta_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U */
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined;
    __m128i temp1, temp2;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag1;
    __m128i diff, alpha_cbcr_16x8, beta_cbcr_16x8;
    __m128i zero = _mm_setzero_si128();
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    /* Load and transpose the pixel values (4 rows per call) */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);

    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
    /* End of transpose */

    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* p0' = (2*p1 + p0 + q1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(p1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p0_uv_8x16, q1_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    p0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    /* q0' = (2*q1 + q0 + p1 + 2) >> 2 */
    temp1 = _mm_slli_epi16(q1_uv_8x16, 1);
    temp2 = _mm_add_epi16(p1_uv_8x16, q0_uv_8x16);
    temp1 = _mm_add_epi16(temp1, _mm_set1_epi16(2));
    temp1 = _mm_add_epi16(temp1, temp2);
    q0_uv_8x16_1 = _mm_srai_epi16(temp1, 2);

    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);

    flag1 = _mm_packs_epi16(flag1, flag1);

    /* Select filtered bytes where flag1 is set, original bytes elsewhere */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp2);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp2);
    lined = _mm_srli_si128(linec, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

}
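
/*
 * The mbaff variants in this file filter only 4 rows per call (the
 * non-mbaff versions above cover 8), so a caller covers the full edge
 * with two calls on adjacent 4-row halves, e.g. (hypothetical sketch,
 * offsets illustrative):
 *
 *     ih264_deblk_chroma_vert_bs4_mbaff_ssse3(pu1_src, src_strd, ...);
 *     ih264_deblk_chroma_vert_bs4_mbaff_ssse3(pu1_src + 4 * src_strd,
 *                                             src_strd, ...);
 */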

/*****************************************************************************/
/* */
/* Function Name : ih264_deblk_chroma_vert_bslt4_mbaff_ssse3() */
/* */
/* Description : This function performs filtering of a chroma block */
/* vertical edge when the boundary strength is less than 4 */
/* in high profile. */
/* */
/* Inputs : pu1_src - pointer to the src sample q0 of U */
/* src_strd - source stride */
/* alpha_cb - alpha value for the boundary in U */
/* beta_cb - beta value for the boundary in U */
/* alpha_cr - alpha value for the boundary in V */
/* beta_cr - beta value for the boundary in V */
/* u4_bs - packed boundary strength array */
/* pu1_cliptab_cb - tc0_table for U */
/* pu1_cliptab_cr - tc0_table for V */
/* */
/* Globals : None */
/* */
/* Processing : When the function is called twice, this operation is as */
/* described in Sec. 8.7.2.3 under the title "Filtering */
/* process for edges for bS less than 4" in ITU-T Rec. H.264, */
/* with alpha and beta values different in U and V. */
/* */
/* Outputs : None */
/* */
/* Returns : None */
/* */
/* Issues : None */
/* */
/* Revision History: */
/* */
/* DD MM YYYY Author(s) Changes (Describe the changes made) */
/* 12 02 2015 Naveen Kumar P Initial version */
/* */
/*****************************************************************************/
void ih264_deblk_chroma_vert_bslt4_mbaff_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               WORD32 alpha_cb,
                                               WORD32 beta_cb,
                                               WORD32 alpha_cr,
                                               WORD32 beta_cr,
                                               UWORD32 u4_bs,
                                               const UWORD8 *pu1_cliptab_cb,
                                               const UWORD8 *pu1_cliptab_cr)
{
    UWORD8 *pu1_src_uv = pu1_src; /* Pointer to the src sample q0 of plane U */
    UWORD8 u1_Bs0, u1_Bs1, u1_Bs2, u1_Bs3;
    WORD32 alpha_cbcr = (alpha_cr << 16) + alpha_cb;
    WORD32 beta_cbcr = (beta_cr << 16) + beta_cb;
    __m128i linea, lineb, linec, lined;
    __m128i temp1, temp2;

    __m128i q0_uv_16x8, p0_uv_16x8, q1_uv_16x8, p1_uv_16x8;
    __m128i q0_uv_8x16, p0_uv_8x16, q1_uv_8x16, p1_uv_8x16;
    __m128i flag_bs, flag1;
    __m128i diff, diff1, alpha_cbcr_16x8, beta_cbcr_16x8, in_macro;
    __m128i zero = _mm_setzero_si128();
    __m128i C0_uv_8x16;
    __m128i p0_uv_8x16_1, p0_uv_8x16_2, q0_uv_8x16_1, q0_uv_8x16_2;

    u1_Bs0 = (u4_bs >> 24) & 0xff;
    u1_Bs1 = (u4_bs >> 16) & 0xff;
    u1_Bs2 = (u4_bs >> 8) & 0xff;
    u1_Bs3 = (u4_bs >> 0) & 0xff;

    /* Byte mask: 0xFF where Bs != 0 (filter), 0x00 where Bs == 0 (skip);
       each Bs value covers the U and V samples of one row */
    flag_bs = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, u1_Bs3, u1_Bs3, u1_Bs2,
                           u1_Bs2, u1_Bs1, u1_Bs1, u1_Bs0, u1_Bs0);
    flag_bs = _mm_cmpeq_epi8(flag_bs, zero); // 0xFF where Bs == 0
    flag_bs = _mm_xor_si128(flag_bs, _mm_set1_epi8(0xFF)); // Invert for required mask

    /* Load and transpose the pixel values (4 rows per call) */
    linea = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4));
    lineb = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + src_strd));
    linec = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd));
    lined = _mm_loadl_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd));

    temp1 = _mm_unpacklo_epi16(linea, lineb);
    temp2 = _mm_unpacklo_epi16(linec, lined);

    p1_uv_16x8 = _mm_unpacklo_epi32(temp1, temp2);
    p0_uv_16x8 = _mm_srli_si128(p1_uv_16x8, 8);
    q0_uv_16x8 = _mm_unpackhi_epi32(temp1, temp2);
    q1_uv_16x8 = _mm_srli_si128(q0_uv_16x8, 8);
    /* End of transpose */

    q0_uv_8x16 = _mm_unpacklo_epi8(q0_uv_16x8, zero);
    q1_uv_8x16 = _mm_unpacklo_epi8(q1_uv_16x8, zero);
    p1_uv_8x16 = _mm_unpacklo_epi8(p1_uv_16x8, zero);
    p0_uv_8x16 = _mm_unpacklo_epi8(p0_uv_16x8, zero);

    diff = _mm_subs_epi16(p0_uv_8x16, q0_uv_8x16); // Condition 1: |p0 - q0| < alpha
    diff = _mm_abs_epi16(diff);
    alpha_cbcr_16x8 = _mm_set1_epi32(alpha_cbcr);
    flag1 = _mm_cmpgt_epi16(alpha_cbcr_16x8, diff);

    diff = _mm_subs_epi16(q1_uv_8x16, q0_uv_8x16); // Condition 2: |q1 - q0| < beta
    diff = _mm_abs_epi16(diff);
    beta_cbcr_16x8 = _mm_set1_epi32(beta_cbcr);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    diff = _mm_subs_epi16(p1_uv_8x16, p0_uv_8x16); // Condition 3: |p1 - p0| < beta
    diff = _mm_abs_epi16(diff);
    flag1 = _mm_and_si128(flag1, _mm_cmpgt_epi16(beta_cbcr_16x8, diff));

    /* delta = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3 */
    diff = _mm_subs_epi16(q0_uv_8x16, p0_uv_8x16);
    diff = _mm_slli_epi16(diff, 2);
    diff1 = _mm_subs_epi16(p1_uv_8x16, q1_uv_8x16);
    diff = _mm_add_epi16(diff, diff1);
    diff = _mm_add_epi16(diff, _mm_set1_epi16(4));
    in_macro = _mm_srai_epi16(diff, 3);

    /* C0 = tc0 per row, interleaved as (Cb, Cr); one Bs value per row */
    C0_uv_8x16 = _mm_set_epi16(pu1_cliptab_cr[u1_Bs3], pu1_cliptab_cb[u1_Bs3],
                               pu1_cliptab_cr[u1_Bs2], pu1_cliptab_cb[u1_Bs2],
                               pu1_cliptab_cr[u1_Bs1], pu1_cliptab_cb[u1_Bs1],
                               pu1_cliptab_cr[u1_Bs0], pu1_cliptab_cb[u1_Bs0]);

    C0_uv_8x16 = _mm_add_epi16(C0_uv_8x16, _mm_set1_epi16(1)); /* tc = tc0 + 1 for chroma */

    in_macro = _mm_min_epi16(C0_uv_8x16, in_macro); // CLIP3(-tc, tc, delta)
    C0_uv_8x16 = _mm_subs_epi16(zero, C0_uv_8x16);
    in_macro = _mm_max_epi16(C0_uv_8x16, in_macro);

    p0_uv_8x16_1 = _mm_add_epi16(p0_uv_8x16, in_macro);
    q0_uv_8x16_1 = _mm_sub_epi16(q0_uv_8x16, in_macro);

    p0_uv_8x16_2 = _mm_packus_epi16(p0_uv_8x16_1, p0_uv_8x16_1);
    q0_uv_8x16_2 = _mm_packus_epi16(q0_uv_8x16_1, q0_uv_8x16_1);

    flag1 = _mm_packs_epi16(flag1, flag1);
    flag1 = _mm_and_si128(flag1, flag_bs); // Final flag (Bs condition + other 3 conditions)

    /* Select filtered bytes where flag1 is set, original bytes elsewhere */
    p0_uv_8x16_1 = _mm_and_si128(p0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    p0_uv_8x16_2 = _mm_and_si128(p0_uv_8x16_2, flag1);
    p0_uv_16x8 = _mm_add_epi8(p0_uv_8x16_1, p0_uv_8x16_2);

    q0_uv_8x16_1 = _mm_and_si128(q0_uv_16x8,
                                 _mm_xor_si128(flag1, _mm_set1_epi8(0xFF)));
    q0_uv_8x16_2 = _mm_and_si128(q0_uv_8x16_2, flag1);
    q0_uv_16x8 = _mm_add_epi8(q0_uv_8x16_1, q0_uv_8x16_2);

    /* Inverse-transpose and store back */
    temp1 = _mm_unpacklo_epi16(p1_uv_16x8, p0_uv_16x8);
    temp2 = _mm_unpacklo_epi16(q0_uv_16x8, q1_uv_16x8);

    linea = _mm_unpacklo_epi32(temp1, temp2);
    lineb = _mm_srli_si128(linea, 8);
    linec = _mm_unpackhi_epi32(temp1, temp2);
    lined = _mm_srli_si128(linec, 8);

    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4), linea);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + src_strd), lineb);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 2 * src_strd), linec);
    _mm_storel_epi64((__m128i *)(pu1_src_uv - 4 + 3 * src_strd), lined);

}