1 /******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ih264_chroma_intra_pred_filters_ssse3.c
24 *
25 * @brief
26 * Contains function definitions for chroma intra prediction filters in x86
27 * intrinsics
28 *
29 * @author
30 * Ittiam
31 *
32 * @par List of Functions:
33 * -ih264_intra_pred_chroma_8x8_mode_horz_ssse3
34 * -ih264_intra_pred_chroma_8x8_mode_vert_ssse3
35 * -ih264_intra_pred_chroma_8x8_mode_plane_ssse3
36 *
37 * @remarks
38 * None
39 *
40 *******************************************************************************
41 */
42
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46
47 /* System include files */
48 #include <stdio.h>
49 #include <stddef.h>
50 #include <string.h>
51
52 /* User include files */
53 #include "ih264_defs.h"
54 #include "ih264_typedefs.h"
55 #include "ih264_macros.h"
56 #include "ih264_platform_macros.h"
57 #include "ih264_intra_pred_filters.h"
58
59
60 /*****************************************************************************/
61 /* Chroma Intra prediction 8x8 filters */
62 /*****************************************************************************/
63 /**
64 *******************************************************************************
65 *
66 * ih264_intra_pred_chroma_8x8_mode_horz_ssse3
67 *
68 * @brief
69 * Perform Intra prediction for chroma_8x8 mode:Horizontal
70 *
71 * @par Description:
72 * Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
73 *
74 * @param[in] pu1_src
75 * UWORD8 pointer to the source containing alternate U and V samples
76 *
77 * @param[out] pu1_dst
78 * UWORD8 pointer to the destination with alternate U and V samples
79 *
80 * @param[in] src_strd
81 * integer source stride
82 *
83 * @param[in] dst_strd
84 * integer destination stride
85 *
86 * @param[in] ngbr_avail
87 * availability of neighbouring pixels(Not used in this function)
88 *
89 * @returns
90 *
91 * @remarks
92 * None
93 *
94 ******************************************************************************
95 */
ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ngbr_avail)96 void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
97 UWORD8 *pu1_dst,
98 WORD32 src_strd,
99 WORD32 dst_strd,
100 WORD32 ngbr_avail)
101 {
102
103 UWORD8 *pu1_left; /* Pointer to start of top predictors */
104 WORD32 dst_strd2;
105
106 __m128i row1_16x8b, row2_16x8b;
107
108 UNUSED(src_strd);
109 UNUSED(ngbr_avail);
110
111 pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;
112
113
114 dst_strd2 = dst_strd << 1;
115 row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left)));
116 row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2)));
117 _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
118 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
119
120 pu1_dst += dst_strd2;
121 row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4)));
122 row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6)));
123 _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
124 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
125
126 pu1_dst += dst_strd2;
127 row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8)));
128 row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10)));
129 _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
130 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
131
132 pu1_dst += dst_strd2;
133 row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12)));
134 row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14)));
135 _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
136 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
137 }
138
139 /**
140 *******************************************************************************
141 *
142 * ih264_intra_pred_chroma_8x8_mode_vert_ssse3
143 *
144 * @brief
145 * Perform Intra prediction for chroma_8x8 mode:vertical
146 *
147 * @par Description:
148 * Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
149 *
150 * @param[in] pu1_src
151 * UWORD8 pointer to the source containing alternate U and V samples
152 *
153 * @param[out] pu1_dst
154 * UWORD8 pointer to the destination with alternate U and V samples
155 *
156 * @param[in] src_strd
157 * integer source stride
158 *
159 * @param[in] dst_strd
160 * integer destination stride
161 *
162 * @param[in] ngbr_avail
163 * availability of neighbouring pixels(Not used in this function)
164 *
165 * @returns
166 *
167 * @remarks
168 * None
169 *
170 *******************************************************************************
171 */
ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ngbr_avail)172 void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
173 UWORD8 *pu1_dst,
174 WORD32 src_strd,
175 WORD32 dst_strd,
176 WORD32 ngbr_avail)
177 {
178 UWORD8 *pu1_top; /* Pointer to start of top predictors */
179 WORD32 dst_strd2;
180
181 __m128i top_16x8b;
182
183 UNUSED(src_strd);
184 UNUSED(ngbr_avail);
185
186 pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;
187
188 top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);
189
190 dst_strd2 = dst_strd << 1;
191 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
192 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
193
194 pu1_dst += dst_strd2;
195 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
196 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
197
198 pu1_dst += dst_strd2;
199 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
200 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
201
202 pu1_dst += dst_strd2;
203 _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
204 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
205 }
206
207 /**
208 *******************************************************************************
209 *
210 * ih264_intra_pred_chroma_8x8_mode_plane_ssse3
211 *
212 * @brief
213 * Perform Intra prediction for chroma_8x8 mode:PLANE
214 *
215 * @par Description:
216 * Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
217 *
218 * @param[in] pu1_src
219 * UWORD8 pointer to the source containing alternate U and V samples
220 *
221 * @param[out] pu1_dst
222 * UWORD8 pointer to the destination with alternate U and V samples
223 *
224 * @param[in] src_strd
225 * integer source stride
226 *
227 * @param[in] dst_strd
228 * integer destination stride
229 *
230 * @param[in] ngbr_avail
231 * availability of neighbouring pixels(Not used in this function)
232 *
233 * @returns
234 *
235 * @remarks
236 * None
237 *
238 ******************************************************************************
239 */
ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 dst_strd,WORD32 ngbr_avail)240 void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
241 UWORD8 *pu1_dst,
242 WORD32 src_strd,
243 WORD32 dst_strd,
244 WORD32 ngbr_avail)
245 {
246 UWORD8 *pu1_left, *pu1_top;
247 WORD32 a_u, a_v, b_u, b_v, c_u, c_v;
248
249 __m128i mul_8x16b, shuffle_8x16b;
250
251 UNUSED(src_strd);
252 UNUSED(ngbr_avail);
253
254 pu1_top = pu1_src + MB_SIZE + 2;
255 pu1_left = pu1_src + MB_SIZE - 2;
256
257 mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
258 shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
259 0xff01, 0xff03, 0xff05, 0xff07);
260
261 //calculating a, b and c
262 {
263 WORD32 h_u, h_v, v_u, v_v;
264
265 __m128i h_val1_16x8b, h_val2_16x8b;
266 __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
267 __m128i v_val1_16x8b, v_val2_16x8b;
268 __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
269 __m128i hv_val_4x32b;
270
271 h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
272 h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));
273 v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14));
274 v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));
275
276 // reversing the order
277 h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
278 v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);
279
280 // separating u and v and 8-bit to 16-bit conversion
281 h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
282 h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
283 v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
284 v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);
285
286 h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
287 v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);
288
289 h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
290 v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);
291
292 hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);
293
294 a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
295 a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;
296
297 h_u = _mm_extract_epi16(hv_val_4x32b, 0);
298 h_v = _mm_extract_epi16(hv_val_4x32b, 2);
299 v_u = _mm_extract_epi16(hv_val_4x32b, 4);
300 v_v = _mm_extract_epi16(hv_val_4x32b, 6);
301
302 h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
303 h_v = (h_v << 16) >> 15;
304 v_u = (v_u << 16) >> 15;
305 v_v = (v_v << 16) >> 15;
306
307 b_u = ((h_u << 4) + h_u + 32) >> 6;
308 b_v = ((h_v << 4) + h_v + 32) >> 6;
309 c_u = ((v_u << 4) + v_u + 32) >> 6;
310 c_v = ((v_v << 4) + v_v + 32) >> 6;
311 }
312 //using a, b and c to compute the fitted plane values
313 {
314 __m128i const_8x16b, c2_8x16b;
315 __m128i res1_l_8x16b, res1_h_8x16b;
316 __m128i res2_l_8x16b, res2_h_8x16b;
317 __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
318 __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;
319
320 WORD32 b_u2, b_v2, b_u3, b_v3;
321 WORD32 const_u, const_v;
322 WORD32 dst_strd2;
323
324 const_u = a_u - (c_u << 1) - c_u + 16;
325 const_v = a_v - (c_v << 1) - c_v + 16;
326
327 b_u2 = b_u << 1;
328 b_v2 = b_v << 1;
329 b_u3 = b_u + b_u2;
330 b_v3 = b_v + b_v2;
331
332 const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v);
333 res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
334 //contains {-b*3, -b*2, -b*1, b*0}
335 res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
336 //contains {b*1, b*2, b*3, b*4}
337 c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);
338
339 // rows 1, 2
340 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
341 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
342 res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
343 res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
344
345 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
346 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
347 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
348 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
349
350 dst_strd2 = dst_strd << 1;
351 c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);
352
353 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
354 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
355
356 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
357 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
358
359 // rows 3, 4
360 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
361 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
362 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
363 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
364
365 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
366 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
367 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
368 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
369
370 pu1_dst += dst_strd2;
371
372 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
373 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
374
375 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
376 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
377
378 // rows 5, 6
379 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
380 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
381 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
382 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
383
384 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
385 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
386 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
387 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
388
389 pu1_dst += dst_strd2;
390
391 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
392 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
393
394 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
395 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
396
397 // rows 7, 8
398 res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
399 res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
400 res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
401 res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);
402
403 res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
404 res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
405 res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
406 res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);
407
408 pu1_dst += dst_strd2;
409
410 res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
411 res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);
412
413 _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
414 _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);
415
416 }
417 }
418