/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ih264_chroma_intra_pred_filters_ssse3.c
*
* @brief
*  Contains function definitions for chroma intra prediction filters in x86
*  intrinsics
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - ih264_intra_pred_chroma_8x8_mode_horz_ssse3
*  - ih264_intra_pred_chroma_8x8_mode_vert_ssse3
*  - ih264_intra_pred_chroma_8x8_mode_plane_ssse3
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <stddef.h>
#include <string.h>

/* SSE/SSSE3 intrinsics */
#include <immintrin.h>

/* User include files */
#include "ih264_defs.h"
#include "ih264_typedefs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_intra_pred_filters.h"


/*****************************************************************************/
/* Chroma Intra prediction 8x8 filters                                       */
/*****************************************************************************/
/**
*******************************************************************************
*
* ih264_intra_pred_chroma_8x8_mode_horz_ssse3
*
* @brief
*  Perform Intra prediction for chroma_8x8 mode: Horizontal
*
* @par Description:
*  Perform Intra prediction for chroma_8x8 mode: Horizontal, described in sec 8.3.4.2
*
* @param[in] pu1_src
*  UWORD8 pointer to the source containing alternate U and V samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination with alternate U and V samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] ngbr_avail
*  availability of neighbouring pixels (not used in this function)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ih264_intra_pred_chroma_8x8_mode_horz_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{

    UWORD8 *pu1_left; /* Pointer to start of left predictors */
    WORD32 dst_strd2;

    __m128i row1_16x8b, row2_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

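    /* Left neighbours are stored bottom-to-top as interleaved (U, V) pairs, */
    /* two bytes per pixel; pu1_left is set to the pair to the left of row 0 */
    /* and each prediction row is that pair broadcast with _mm_set1_epi16.   */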
    pu1_left = pu1_src + 2 * BLK8x8SIZE - 2;

    dst_strd2 = dst_strd << 1;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 2)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 4)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 6)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 8)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 10)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);

    pu1_dst += dst_strd2;
    row1_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 12)));
    row2_16x8b = _mm_set1_epi16(*((WORD16 *)(pu1_left - 14)));
    _mm_storeu_si128((__m128i *)pu1_dst, row1_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), row2_16x8b);
}

/**
*******************************************************************************
*
* ih264_intra_pred_chroma_8x8_mode_vert_ssse3
*
* @brief
*  Perform Intra prediction for chroma_8x8 mode: Vertical
*
* @par Description:
*  Perform Intra prediction for chroma_8x8 mode: Vertical, described in sec 8.3.4.3
*
* @param[in] pu1_src
*  UWORD8 pointer to the source containing alternate U and V samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination with alternate U and V samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] ngbr_avail
*  availability of neighbouring pixels (not used in this function)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ih264_intra_pred_chroma_8x8_mode_vert_ssse3(UWORD8 *pu1_src,
                                                 UWORD8 *pu1_dst,
                                                 WORD32 src_strd,
                                                 WORD32 dst_strd,
                                                 WORD32 ngbr_avail)
{
    UWORD8 *pu1_top; /* Pointer to start of top predictors */
    WORD32 dst_strd2;

    __m128i top_16x8b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + 2 * BLK8x8SIZE + 2;

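    /* The 8 top-neighbour (U, V) pairs (16 bytes) are loaded once and       */
    /* written unchanged to all 8 rows of the prediction block.              */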
    top_16x8b = _mm_loadu_si128((__m128i *)pu1_top);

    dst_strd2 = dst_strd << 1;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);

    pu1_dst += dst_strd2;
    _mm_storeu_si128((__m128i *)pu1_dst, top_16x8b);
    _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), top_16x8b);
}

/**
*******************************************************************************
*
* ih264_intra_pred_chroma_8x8_mode_plane_ssse3
*
* @brief
*  Perform Intra prediction for chroma_8x8 mode: Plane
*
* @par Description:
*  Perform Intra prediction for chroma_8x8 mode: Plane, described in sec 8.3.4.4
*
* @param[in] pu1_src
*  UWORD8 pointer to the source containing alternate U and V samples
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination with alternate U and V samples
*
* @param[in] src_strd
*  integer source stride
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] ngbr_avail
*  availability of neighbouring pixels (not used in this function)
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
void ih264_intra_pred_chroma_8x8_mode_plane_ssse3(UWORD8 *pu1_src,
                                                  UWORD8 *pu1_dst,
                                                  WORD32 src_strd,
                                                  WORD32 dst_strd,
                                                  WORD32 ngbr_avail)
{
    UWORD8 *pu1_left, *pu1_top;
    WORD32 a_u, a_v, b_u, b_v, c_u, c_v;

    __m128i mul_8x16b, shuffle_8x16b;

    UNUSED(src_strd);
    UNUSED(ngbr_avail);

    pu1_top = pu1_src + MB_SIZE + 2;
    pu1_left = pu1_src + MB_SIZE - 2;

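    /* mul_8x16b holds the weights (1, 2, 3, 4) used for the H and V         */
    /* gradient sums. shuffle_8x16b is a pshufb mask: index bytes with the   */
    /* high bit set (0xff) produce zero, so it de-interleaves a (U, V) byte  */
    /* sequence into 16-bit lanes, U samples in lanes 0-3 and V samples in   */
    /* lanes 4-7.                                                            */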
    mul_8x16b = _mm_setr_epi16(1, 2, 3, 4, 1, 2, 3, 4);
    shuffle_8x16b = _mm_setr_epi16(0xff00, 0xff02, 0xff04, 0xff06,
                                   0xff01, 0xff03, 0xff05, 0xff07);

    //calculating a, b and c
    {
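        /* Chroma (4:2:0) plane prediction, sec 8.3.4.4, computed            */
        /* independently for U and V:                                        */
        /*   pred[x, y] = Clip1((a + b * (x - 3) + c * (y - 3) + 16) >> 5)   */
        /*   H = sum over x' = 0..3 of (x' + 1) * (p[4 + x', -1] - p[2 - x', -1]) */
        /*   V = sum over y' = 0..3 of (y' + 1) * (p[-1, 4 + y'] - p[-1, 2 - y']) */
        /*   a = 16 * (p[-1, 7] + p[7, -1])                                  */
        /*   b = (34 * H + 32) >> 6,    c = (34 * V + 32) >> 6               */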
        WORD32 h_u, h_v, v_u, v_v;

        __m128i h_val1_16x8b, h_val2_16x8b;
        __m128i h_val1_8x16b, h_val2_8x16b, h_val_4x32b;
        __m128i v_val1_16x8b, v_val2_16x8b;
        __m128i v_val1_8x16b, v_val2_8x16b, v_val_4x32b;
        __m128i hv_val_4x32b;

        h_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top + 8));
        h_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_top - 2));
        v_val1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 14));
        v_val2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_left - 4));

        // reversing the order
        h_val2_16x8b = _mm_shufflelo_epi16(h_val2_16x8b, 0x1b);
        v_val1_16x8b = _mm_shufflelo_epi16(v_val1_16x8b, 0x1b);

        // separating u and v and 8-bit to 16-bit conversion
        h_val1_8x16b = _mm_shuffle_epi8(h_val1_16x8b, shuffle_8x16b);
        h_val2_8x16b = _mm_shuffle_epi8(h_val2_16x8b, shuffle_8x16b);
        v_val1_8x16b = _mm_shuffle_epi8(v_val1_16x8b, shuffle_8x16b);
        v_val2_8x16b = _mm_shuffle_epi8(v_val2_16x8b, shuffle_8x16b);

        h_val1_8x16b = _mm_sub_epi16(h_val1_8x16b, h_val2_8x16b);
        v_val1_8x16b = _mm_sub_epi16(v_val1_8x16b, v_val2_8x16b);

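        /* Each difference vector now holds the U terms in lanes 0-3 and the */
        /* V terms in lanes 4-7; pmaddwd with (1, 2, 3, 4) forms the partial */
        /* sums and the horizontal add packs H_u, H_v, V_u, V_v into one     */
        /* register.                                                         */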
        h_val_4x32b = _mm_madd_epi16(mul_8x16b, h_val1_8x16b);
        v_val_4x32b = _mm_madd_epi16(mul_8x16b, v_val1_8x16b);

        hv_val_4x32b = _mm_hadd_epi32(h_val_4x32b, v_val_4x32b);

        a_u = (pu1_left[7 * (-2)] + pu1_top[14]) << 4;
        a_v = (pu1_left[7 * (-2) + 1] + pu1_top[15]) << 4;

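        /* Each 32-bit sum fits in 16 bits (|H|, |V| <= 10 * 255), so only   */
        /* the low halves are extracted; the shift pair below sign-extends   */
        /* them and doubles them, so that the multiply by 17 and shift by 6  */
        /* that follow implement (34 * H + 32) >> 6.                         */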
        h_u = _mm_extract_epi16(hv_val_4x32b, 0);
        h_v = _mm_extract_epi16(hv_val_4x32b, 2);
        v_u = _mm_extract_epi16(hv_val_4x32b, 4);
        v_v = _mm_extract_epi16(hv_val_4x32b, 6);

        h_u = (h_u << 16) >> 15; // sign-extension and multiplication by 2
        h_v = (h_v << 16) >> 15;
        v_u = (v_u << 16) >> 15;
        v_v = (v_v << 16) >> 15;

        b_u = ((h_u << 4) + h_u + 32) >> 6;
        b_v = ((h_v << 4) + h_v + 32) >> 6;
        c_u = ((v_u << 4) + v_u + 32) >> 6;
        c_v = ((v_v << 4) + v_v + 32) >> 6;
    }
    //using a, b and c to compute the fitted plane values
    {
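        /* Rows are generated two at a time: res1 holds row y and res2 holds */
        /* row y + 1 (res1 plus c in every lane). The constant term          */
        /* a - 3*c + 16 and the column offsets b*(x - 3) are set up once;    */
        /* each later row pair just adds 2*c, shifts right by 5 and packs    */
        /* with unsigned saturation, which also performs the Clip1 to        */
        /* [0, 255].                                                         */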
        __m128i const_8x16b, c2_8x16b;
        __m128i res1_l_8x16b, res1_h_8x16b;
        __m128i res2_l_8x16b, res2_h_8x16b;
        __m128i res1_sh_l_8x16b, res1_sh_h_8x16b, res1_16x8b;
        __m128i res2_sh_l_8x16b, res2_sh_h_8x16b, res2_16x8b;

        WORD32 b_u2, b_v2, b_u3, b_v3;
        WORD32 const_u, const_v;
        WORD32 dst_strd2;

        const_u = a_u - (c_u << 1) - c_u + 16;
        const_v = a_v - (c_v << 1) - c_v + 16;

        b_u2 = b_u << 1;
        b_v2 = b_v << 1;
        b_u3 = b_u + b_u2;
        b_v3 = b_v + b_v2;

        const_8x16b = _mm_setr_epi16(const_u, const_v, const_u, const_v, const_u, const_v, const_u, const_v);
        res1_l_8x16b = _mm_setr_epi16(-b_u3, -b_v3, -b_u2, -b_v2, -b_u, -b_v, 0, 0);
        //contains {-b*3, -b*2, -b*1, b*0}
        res1_h_8x16b = _mm_setr_epi16(b_u, b_v, b_u2, b_v2, b_u3, b_v3, b_u << 2, b_v << 2);
        //contains {b*1, b*2, b*3, b*4}
        c2_8x16b = _mm_setr_epi16(c_u, c_v, c_u, c_v, c_u, c_v, c_u, c_v);

        // rows 1, 2
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, const_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, const_8x16b);
        res2_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        dst_strd2 = dst_strd << 1;
        c2_8x16b = _mm_slli_epi16(c2_8x16b, 1);

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 3, 4
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 5, 6
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

        // rows 7, 8
        res1_l_8x16b = _mm_add_epi16(res1_l_8x16b, c2_8x16b);
        res1_h_8x16b = _mm_add_epi16(res1_h_8x16b, c2_8x16b);
        res2_l_8x16b = _mm_add_epi16(res2_l_8x16b, c2_8x16b);
        res2_h_8x16b = _mm_add_epi16(res2_h_8x16b, c2_8x16b);

        res1_sh_l_8x16b = _mm_srai_epi16(res1_l_8x16b, 5);
        res1_sh_h_8x16b = _mm_srai_epi16(res1_h_8x16b, 5);
        res2_sh_l_8x16b = _mm_srai_epi16(res2_l_8x16b, 5);
        res2_sh_h_8x16b = _mm_srai_epi16(res2_h_8x16b, 5);

        pu1_dst += dst_strd2;

        res1_16x8b = _mm_packus_epi16(res1_sh_l_8x16b, res1_sh_h_8x16b);
        res2_16x8b = _mm_packus_epi16(res2_sh_l_8x16b, res2_sh_h_8x16b);

        _mm_storeu_si128((__m128i *)pu1_dst, res1_16x8b);
        _mm_storeu_si128((__m128i *)(pu1_dst + dst_strd), res2_16x8b);

    }
}