1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevcd_it_rec_dc_atom_intr.c
22 *
23 * @brief
24 * Platform specific intrinsic implementation of certain functions
25 *
26 * @author
27 * Ittiam
28 * @par List of Functions:
* - ihevcd_itrans_recon_dc_luma_ssse3
* - ihevcd_itrans_recon_dc_chroma_ssse3
31 *
32 * @remarks
33 * None
34 *
35 *******************************************************************************
36 */
37
#include <string.h>

#include "ihevc_typedefs.h"
#include "ihevc_defs.h"
#include "ihevc_macros.h"
#include "ihevc_platform_macros.h"
#include "ihevcd_function_selector.h"

#include <immintrin.h>
45
46
47
48
/**
*******************************************************************************
*
* @brief
*  DC-only inverse transform and reconstruction of a luma block using SSE
*  intrinsics
*
* @par Description:
*  The DC coefficient is scaled through the two inverse transform stages
*  (multiply by 64, add the rounding offset, shift right by IT_SHIFT_STAGE_1
*  and then by IT_SHIFT_STAGE_2, clipping to 16 bits after each stage). The
*  resulting value is added to every prediction sample of the
*  (1 << log2_trans_size) x (1 << log2_trans_size) block and the sum is
*  written to the destination saturated to 8 bits.
*
* @param[in] pu1_pred
*  Pointer to the top-left prediction sample
*
* @param[out] pu1_dst
*  Pointer to the top-left destination sample
*
* @param[in] pred_strd
*  Distance in bytes between two consecutive prediction rows
*
* @param[in] dst_strd
*  Distance in bytes between two consecutive destination rows
*
* @param[in] log2_trans_size
*  log2 of the block width/height
*
* @param[in] i2_coeff_value
*  DC coefficient of the block
*
* @returns
*  None
*
* @remarks
*  The 4x4 case reads and writes exactly 4 bytes per row. All other sizes are
*  processed in tiles of 8 columns x 4 rows, so they access 8 bytes per row
*  per tile.
*
*******************************************************************************
*/
void ihevcd_itrans_recon_dc_luma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
                                       WORD32 log2_trans_size, WORD16 i2_coeff_value)
{
    __m128i dc_8x16b, zero_8x16b;
    WORD32 add, shift;
    WORD32 dc_value, quant_out;
    WORD32 trans_size;

    trans_size = (1 << log2_trans_size);

    quant_out = i2_coeff_value;

    /* Both transform stages reduce to a scale-and-round of the DC term */
    shift = IT_SHIFT_STAGE_1;
    add = 1 << (shift - 1);
    dc_value = CLIP_S16((quant_out * 64 + add) >> shift);
    shift = IT_SHIFT_STAGE_2;
    add = 1 << (shift - 1);
    dc_value = CLIP_S16((dc_value * 64 + add) >> shift);

    /* Replicate the DC value in all eight 16 bit lanes */
    dc_8x16b = _mm_set1_epi16(dc_value);
    zero_8x16b = _mm_setzero_si128();

    if(trans_size == 4)
    {
        /* Staging buffer holding the whole 4x4 block, one row per 4 bytes */
        UWORD8 au1_blk[16];
        __m128i blk_16x8b;
        __m128i rows01_8x16b, rows23_8x16b;
        WORD32 row;

        /* Gather exactly 4 bytes per row; memcpy avoids reading past the */
        /* row and has no alignment or aliasing requirements              */
        for(row = 0; row < 4; row++)
        {
            memcpy(au1_blk + 4 * row, pu1_pred + row * pred_strd, 4);
        }

        blk_16x8b = _mm_loadu_si128((__m128i *)au1_blk);

        /* Widen to 16 bit: low half holds rows 0-1, high half rows 2-3 */
        rows01_8x16b = _mm_unpacklo_epi8(blk_16x8b, zero_8x16b);
        rows23_8x16b = _mm_unpackhi_epi8(blk_16x8b, zero_8x16b);

        rows01_8x16b = _mm_add_epi16(rows01_8x16b, dc_8x16b);
        rows23_8x16b = _mm_add_epi16(rows23_8x16b, dc_8x16b);

        /* Saturate back to 8 bit, rows 0..3 in consecutive 4 byte groups */
        blk_16x8b = _mm_packus_epi16(rows01_8x16b, rows23_8x16b);
        _mm_storeu_si128((__m128i *)au1_blk, blk_16x8b);

        for(row = 0; row < 4; row++)
        {
            memcpy(pu1_dst + row * dst_strd, au1_blk + 4 * row, 4);
        }
    }
    else
    {
        WORD32 row, col;

        for(row = 0; row < trans_size; row += 4)
        {
            for(col = 0; col < trans_size; col += 8)
            {
                __m128i pred0_16x8b, pred1_16x8b, pred2_16x8b, pred3_16x8b;
                __m128i sum0_8x16b, sum1_8x16b, sum2_8x16b, sum3_8x16b;
                __m128i out01_16x8b, out23_16x8b;

                /* Load a tile of 8 samples x 4 rows */
                pred0_16x8b = _mm_loadl_epi64((__m128i *)pu1_pred);
                pred1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred + pred_strd));
                pred2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred + 2 * pred_strd));
                pred3_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred + 3 * pred_strd));

                /* Widen to 16 bit and add the DC value */
                sum0_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(pred0_16x8b, zero_8x16b), dc_8x16b);
                sum1_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(pred1_16x8b, zero_8x16b), dc_8x16b);
                sum2_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(pred2_16x8b, zero_8x16b), dc_8x16b);
                sum3_8x16b = _mm_add_epi16(_mm_unpacklo_epi8(pred3_16x8b, zero_8x16b), dc_8x16b);

                /* Saturate to 8 bit; each register carries two rows */
                out01_16x8b = _mm_packus_epi16(sum0_8x16b, sum1_8x16b);
                out23_16x8b = _mm_packus_epi16(sum2_8x16b, sum3_8x16b);

                _mm_storel_epi64((__m128i *)pu1_dst, out01_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), _mm_srli_si128(out01_16x8b, 8));
                _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), out23_16x8b);
                _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), _mm_srli_si128(out23_16x8b, 8));

                pu1_pred += 8;
                pu1_dst += 8;
            }

            /* Rewind to the first column and step down four rows */
            pu1_pred += 4 * pred_strd - trans_size;
            pu1_dst += 4 * dst_strd - trans_size;
        }
    }
}
187
/**
*******************************************************************************
*
* @brief
*  DC-only inverse transform and reconstruction of one chroma component
*  stored byte-interleaved with the other component, using SSSE3 intrinsics
*
* @par Description:
*  The DC coefficient is scaled through the two inverse transform stages
*  (multiply by 64, add the rounding offset, shift right by IT_SHIFT_STAGE_1
*  and then by IT_SHIFT_STAGE_2, clipping to 16 bits after each stage). The
*  resulting value is added to the prediction samples found at even byte
*  offsets from pu1_pred and the sum, saturated to 8 bits, is written to the
*  even byte offsets from pu1_dst. The bytes at odd offsets from pu1_dst are
*  read first and written back unchanged so that the other chroma component
*  is not corrupted.
*
* @param[in] pu1_pred
*  Pointer to the first prediction sample of the component being processed
*
* @param[in,out] pu1_dst
*  Pointer to the first destination sample of the component being processed
*
* @param[in] pred_strd
*  Distance in bytes between two consecutive prediction rows
*
* @param[in] dst_strd
*  Distance in bytes between two consecutive destination rows
*
* @param[in] log2_trans_size
*  log2 of the block width/height, counted in samples of one component
*
* @param[in] i2_coeff_value
*  DC coefficient of the block
*
* @returns
*  None
*
* @remarks
*  The 4x4 case accesses 8 bytes per row. All other sizes are processed in
*  tiles of 8 samples (16 bytes) x 4 rows.
*
*******************************************************************************
*/
void ihevcd_itrans_recon_dc_chroma_ssse3(UWORD8 *pu1_pred, UWORD8 *pu1_dst, WORD32 pred_strd, WORD32 dst_strd,
                                         WORD32 log2_trans_size, WORD16 i2_coeff_value)
{
    __m128i dc_8x16b;
    __m128i zero_8x16b;
    __m128i sel_cur_16x8b;    /* shuffle control: gathers even bytes (component in process) */
    __m128i sel_other_16x8b;  /* shuffle control: gathers odd bytes (component to preserve) */
    WORD32 stage_shift, rnd;
    WORD32 dc_value;
    WORD32 blk_size;
    WORD32 r;

    blk_size = (1 << log2_trans_size);

    /* Both transform stages reduce to a scale-and-round of the DC term */
    stage_shift = IT_SHIFT_STAGE_1;
    rnd = 1 << (stage_shift - 1);
    dc_value = CLIP_S16((i2_coeff_value * 64 + rnd) >> stage_shift);
    stage_shift = IT_SHIFT_STAGE_2;
    rnd = 1 << (stage_shift - 1);
    dc_value = CLIP_S16((dc_value * 64 + rnd) >> stage_shift);

    /* DC value in all eight 16 bit lanes */
    dc_8x16b = _mm_set1_epi16(dc_value);
    zero_8x16b = _mm_setzero_si128();

    if(blk_size == 4)
    {
        __m128i cur_16x8b[4];
        __m128i other_16x8b[4];
        __m128i rows01_8x16b, rows23_8x16b;
        __m128i packed_16x8b;

        /* Only the low four lanes carry useful indices for a 4 wide row */
        sel_cur_16x8b = _mm_setr_epi8(0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
        sel_other_16x8b = _mm_setr_epi8(1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);

        /* Prediction: keep only the component being reconstructed */
        for(r = 0; r < 4; r++)
        {
            __m128i pred_16x8b = _mm_loadl_epi64((__m128i *)(pu1_pred + r * pred_strd));

            cur_16x8b[r] = _mm_shuffle_epi8(pred_16x8b, sel_cur_16x8b);
        }

        /* Two rows per register, widened to 16 bit, plus DC */
        rows01_8x16b = _mm_unpacklo_epi8(_mm_unpacklo_epi32(cur_16x8b[0], cur_16x8b[1]), zero_8x16b);
        rows23_8x16b = _mm_unpacklo_epi8(_mm_unpacklo_epi32(cur_16x8b[2], cur_16x8b[3]), zero_8x16b);

        rows01_8x16b = _mm_add_epi16(rows01_8x16b, dc_8x16b);
        rows23_8x16b = _mm_add_epi16(rows23_8x16b, dc_8x16b);

        /* Destination: fetch the other component so it can be written back untouched */
        for(r = 0; r < 4; r++)
        {
            __m128i recon_16x8b = _mm_loadl_epi64((__m128i *)(pu1_dst + r * dst_strd));

            other_16x8b[r] = _mm_shuffle_epi8(recon_16x8b, sel_other_16x8b);
        }

        /* Rows 0..3 end up in consecutive 4 byte groups */
        packed_16x8b = _mm_packus_epi16(rows01_8x16b, rows23_8x16b);

        /* Re-interleave with the preserved component and store row by row */
        for(r = 0; r < 4; r++)
        {
            _mm_storel_epi64((__m128i *)(pu1_dst + r * dst_strd),
                             _mm_unpacklo_epi8(packed_16x8b, other_16x8b[r]));
            packed_16x8b = _mm_srli_si128(packed_16x8b, 4);
        }
    }
    else
    {
        WORD32 row, col;

        sel_cur_16x8b = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0);
        sel_other_16x8b = _mm_setr_epi8(1, 3, 5, 7, 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0);

        for(row = 0; row < blk_size; row += 4)
        {
            for(col = 0; col < blk_size; col += 8)
            {
                __m128i sum_8x16b[4];
                __m128i other_16x8b[4];
                __m128i packed_16x8b;
                __m128i merged_16x8b;

                /* Prediction: retain one component, widen to 16 bit, add DC */
                for(r = 0; r < 4; r++)
                {
                    __m128i pred_16x8b = _mm_loadu_si128((__m128i *)(pu1_pred + r * pred_strd));

                    pred_16x8b = _mm_shuffle_epi8(pred_16x8b, sel_cur_16x8b);
                    sum_8x16b[r] = _mm_add_epi16(_mm_unpacklo_epi8(pred_16x8b, zero_8x16b), dc_8x16b);
                }

                /* Destination: fetch the other component of all four rows */
                /* before anything is stored                               */
                for(r = 0; r < 4; r++)
                {
                    __m128i recon_16x8b = _mm_loadu_si128((__m128i *)(pu1_dst + r * dst_strd));

                    other_16x8b[r] = _mm_shuffle_epi8(recon_16x8b, sel_other_16x8b);
                }

                /* Two rows at a time: saturate, re-interleave, store as 2 x 8 bytes */
                for(r = 0; r < 4; r += 2)
                {
                    UWORD8 *pu1_row = pu1_dst + r * dst_strd;

                    packed_16x8b = _mm_packus_epi16(sum_8x16b[r], sum_8x16b[r + 1]);

                    merged_16x8b = _mm_unpacklo_epi8(packed_16x8b, other_16x8b[r]);
                    _mm_storel_epi64((__m128i *)pu1_row, merged_16x8b);
                    _mm_storel_epi64((__m128i *)(pu1_row + 8), _mm_srli_si128(merged_16x8b, 8));

                    pu1_row = pu1_dst + (r + 1) * dst_strd;

                    merged_16x8b = _mm_unpacklo_epi8(_mm_srli_si128(packed_16x8b, 8), other_16x8b[r + 1]);
                    _mm_storel_epi64((__m128i *)pu1_row, merged_16x8b);
                    _mm_storel_epi64((__m128i *)(pu1_row + 8), _mm_srli_si128(merged_16x8b, 8));
                }

                pu1_pred += 16;
                pu1_dst += 16;
            }

            /* Rewind to the first column (2 bytes per sample) and step down four rows */
            pu1_pred += 4 * pred_strd - 2 * blk_size;
            pu1_dst += 4 * dst_strd - 2 * blk_size;
        }
    }
}
402