/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  ih264_iquant_itrans_recon_dc_ssse3.c
 *
 * @brief
 *  Contains function definitions for inverse quantization, inverse
 *  transform and reconstruction
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - ih264_iquant_itrans_recon_4x4_dc_ssse3()
 *  - ih264_iquant_itrans_recon_8x8_dc_ssse3()
 *  - ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_trans_macros.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_data.h"
#include "ih264_size_defs.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"
#include <immintrin.h>

/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 sub block from the quantized residue
 * and prediction buffer for a dc-only input pattern, i.e. only the (0,0) element
 * of the input 4x4 block is non-zero. For the complete function, refer to
 * ih264_iquant_itrans_recon_ssse3.c
 *
 * @par Description:
 *  The quantized residue is first inverse quantized, then inverse transformed.
 *  This inverse transformed content is added to the prediction buffer to recon-
 *  struct the end output
 *
 * @param[in] pi2_src
 *  quantized 4x4 block
 *
 * @param[in] pu1_pred
 *  prediction 4x4 block
 *
 * @param[out] pu1_out
 *  reconstructed 4x4 block
 *
 * @param[in] pred_strd
 *  prediction buffer stride
 *
 * @param[in] out_strd
 *  recon buffer stride
 *
 * @param[in] pu2_iscal_mat
 *  pointer to the inverse scaling matrix
 *
 * @param[in] pu2_weigh_mat
 *  pointer to the weighting (scaling list) matrix
 *
 * @param[in] u4_qp_div_6
 *  Floor(qp/6)
 *
 * @param[in] pi2_tmp
 *  temporary buffer of size 1*16 (unused in this dc-only path)
 *
 * @param[in] iq_start_idx
 *  non-zero for an intra block, in which case the dc value is read from
 *  pi2_dc_ld_addr instead of being inverse quantized here
 *
 * @param[in] pi2_dc_ld_addr
 *  pointer to the dc value for the intra case
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void ih264_iquant_itrans_recon_4x4_dc_ssse3(WORD16 *pi2_src,
                                            UWORD8 *pu1_pred,
                                            UWORD8 *pu1_out,
                                            WORD32 pred_strd,
                                            WORD32 out_strd,
                                            const UWORD16 *pu2_iscal_mat,
                                            const UWORD16 *pu2_weigh_mat,
                                            UWORD32 u4_qp_div_6,
                                            WORD16 *pi2_tmp,
                                            WORD32 iq_start_idx,
                                            WORD16 *pi2_dc_ld_addr)
{
    UWORD32 *pu4_out = (UWORD32 *)pu1_out;
    WORD32 q0 = pi2_src[0];
    WORD16 i_macro, rnd_fact = (u4_qp_div_6 < 4) ? 1 << (3 - u4_qp_div_6) : 0;

    __m128i predload_r, pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i temp4, temp5, temp6, temp7;
    __m128i value_add;

    UNUSED(pi2_tmp);

    INV_QUANT(q0, pu2_iscal_mat[0], pu2_weigh_mat[0], u4_qp_div_6, rnd_fact, 4);

    if(iq_start_idx != 0)
        q0 = pi2_dc_ld_addr[0]; // Restoring dc value for intra case

    i_macro = ((q0 + 32) >> 6);

    value_add = _mm_set1_epi16(i_macro);

    zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    // Load pred buffer
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p00 p01 p02 p03 0 0 0 0 -- all 16 bits
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p10 p11 p12 p13 0 0 0 0 -- all 16 bits
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p20 p21 p22 p23 0 0 0 0 -- all 16 bits
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p30 p31 p32 p33 0 0 0 0 -- all 16 bits

    pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13
    pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22 p23 p30 p31 p32 p33

    temp4 = _mm_add_epi16(value_add, pred_r0);
    temp5 = _mm_add_epi16(value_add, pred_r2);
    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check
    temp4 = _mm_and_si128(temp4, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check
    temp5 = _mm_and_si128(temp5, sign_reg);

    temp4 = _mm_packus_epi16(temp4, temp5);
    temp5 = _mm_srli_si128(temp4, 4);
    temp6 = _mm_srli_si128(temp5, 4);
    temp7 = _mm_srli_si128(temp6, 4);

    *pu4_out = _mm_cvtsi128_si32(temp4);
    pu1_out += out_strd;
    pu4_out = (UWORD32 *)(pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(temp5);
    pu1_out += out_strd;
    pu4_out = (UWORD32 *)(pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(temp6);
    pu1_out += out_strd;
    pu4_out = (UWORD32 *)(pu1_out);
    *(pu4_out) = _mm_cvtsi128_si32(temp7);
}
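
/*
 * Illustrative scalar sketch (not part of the library build): for a dc-only
 * block the 4x4 inverse transform spreads the single dequantized coefficient
 * uniformly over the block, so the SSSE3 path above is equivalent to the
 * plain-C loop below. The clamp is written out explicitly instead of assuming
 * any particular clipping macro.
 *
 *     WORD32 i, j, i4_val;
 *     for(i = 0; i < 4; i++)
 *     {
 *         for(j = 0; j < 4; j++)
 *         {
 *             i4_val = pu1_pred[i * pred_strd + j] + i_macro;
 *             if(i4_val < 0)   i4_val = 0;
 *             if(i4_val > 255) i4_val = 255;
 *             pu1_out[i * out_strd + j] = (UWORD8)i4_val;
 *         }
 *     }
 */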
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs inverse quant and inverse transform for an 8x8 block
 *  for a dc-only input pattern, i.e. only the (0,0) element of the input 8x8
 *  block is non-zero. For the complete function, refer to
 *  ih264_iquant_itrans_recon_ssse3.c
 *
 * @par Description:
 *  Performs inverse transform Ci8 and adds the residue to get the
 *  reconstructed block
 *
 * @param[in] pi2_src
 *  input 8x8 coefficients
 *
 * @param[in] pu1_pred
 *  prediction 8x8 block
 *
 * @param[out] pu1_out
 *  reconstructed 8x8 block
 *
 * @param[in] pred_strd
 *  prediction buffer stride
 *
 * @param[in] out_strd
 *  recon buffer stride
 *
 * @param[in] pu2_iscale_mat
 *  pointer to the inverse scaling matrix
 *
 * @param[in] pu2_weigh_mat
 *  pointer to the weighting (scaling list) matrix
 *
 * @param[in] qp_div
 *  Floor(qp/6)
 *
 * @param[in] pi2_tmp
 *  temporary buffer of size 1*64 (unused in this dc-only path)
 *
 * @param[in] iq_start_idx
 *  unused in this dc-only path
 *
 * @param[in] pi2_dc_ld_addr
 *  unused in this dc-only path
 *
 * @returns Void
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

void ih264_iquant_itrans_recon_8x8_dc_ssse3(WORD16 *pi2_src,
                                            UWORD8 *pu1_pred,
                                            UWORD8 *pu1_out,
                                            WORD32 pred_strd,
                                            WORD32 out_strd,
                                            const UWORD16 *pu2_iscale_mat,
                                            const UWORD16 *pu2_weigh_mat,
                                            UWORD32 qp_div,
                                            WORD16 *pi2_tmp,
                                            WORD32 iq_start_idx,
                                            WORD16 *pi2_dc_ld_addr)
{
    WORD32 q0 = pi2_src[0];
    WORD16 i_macro, rnd_fact = (qp_div < 6) ? 1 << (5 - qp_div) : 0;

    __m128i predload_r, pred_r0, pred_r1, pred_r2, pred_r3, pred_r4, pred_r5, pred_r6, pred_r7;
    __m128i sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
    __m128i value_add;

    UNUSED(pi2_tmp);
    UNUSED(iq_start_idx);
    UNUSED(pi2_dc_ld_addr);

    INV_QUANT(q0, pu2_iscale_mat[0], pu2_weigh_mat[0], qp_div, rnd_fact, 6);
    i_macro = ((q0 + 32) >> 6);

    value_add = _mm_set1_epi16(i_macro);

    // Load pred buffer row 0
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r0 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 1
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 2
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[2 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 3
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[3 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 4
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[4 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r4 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 5
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[5 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r5 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 6
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[6 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r6 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits
    // Load pred buffer row 7
    predload_r = _mm_loadl_epi64((__m128i *)(&pu1_pred[7 * pred_strd])); //p0 p1 p2 p3 p4 p5 p6 p7 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r7 = _mm_unpacklo_epi8(predload_r, zero_8x16b); //p0 p1 p2 p3 p4 p5 p6 p7 -- all 16 bits

    temp1 = _mm_add_epi16(value_add, pred_r0);
    temp2 = _mm_add_epi16(value_add, pred_r1);
    temp3 = _mm_add_epi16(value_add, pred_r2);
    temp4 = _mm_add_epi16(value_add, pred_r3);
    temp5 = _mm_add_epi16(value_add, pred_r4);
    temp6 = _mm_add_epi16(value_add, pred_r5);
    temp7 = _mm_add_epi16(value_add, pred_r6);
    temp8 = _mm_add_epi16(value_add, pred_r7);
    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(temp1, zero_8x16b); // sign check
    temp1 = _mm_and_si128(temp1, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp2, zero_8x16b); // sign check
    temp2 = _mm_and_si128(temp2, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp3, zero_8x16b); // sign check
    temp3 = _mm_and_si128(temp3, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp4, zero_8x16b); // sign check
    temp4 = _mm_and_si128(temp4, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp5, zero_8x16b); // sign check
    temp5 = _mm_and_si128(temp5, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp6, zero_8x16b); // sign check
    temp6 = _mm_and_si128(temp6, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp7, zero_8x16b); // sign check
    temp7 = _mm_and_si128(temp7, sign_reg);
    sign_reg = _mm_cmpgt_epi16(temp8, zero_8x16b); // sign check
    temp8 = _mm_and_si128(temp8, sign_reg);

    temp1 = _mm_packus_epi16(temp1, zero_8x16b);
    temp2 = _mm_packus_epi16(temp2, zero_8x16b);
    temp3 = _mm_packus_epi16(temp3, zero_8x16b);
    temp4 = _mm_packus_epi16(temp4, zero_8x16b);
    temp5 = _mm_packus_epi16(temp5, zero_8x16b);
    temp6 = _mm_packus_epi16(temp6, zero_8x16b);
    temp7 = _mm_packus_epi16(temp7, zero_8x16b);
    temp8 = _mm_packus_epi16(temp8, zero_8x16b);

    _mm_storel_epi64((__m128i *)(&pu1_out[0]), temp1);
    _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), temp2);
    _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), temp3);
    _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), temp4);
    _mm_storel_epi64((__m128i *)(&pu1_out[4 * out_strd]), temp5);
    _mm_storel_epi64((__m128i *)(&pu1_out[5 * out_strd]), temp6);
    _mm_storel_epi64((__m128i *)(&pu1_out[6 * out_strd]), temp7);
    _mm_storel_epi64((__m128i *)(&pu1_out[7 * out_strd]), temp8);
}
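
/*
 * Note on the clipping pattern used in both functions above: for each 16-bit
 * sum, _mm_cmpgt_epi16(x, zero) yields an all-ones mask only where the sum is
 * positive, so the following _mm_and_si128 forces non-positive sums to zero;
 * _mm_packus_epi16 then saturates anything above 255 while narrowing to
 * 8 bits. A scalar sketch of the same operation on one sample, assuming a
 * predicted sample p and the dc offset i_macro:
 *
 *     WORD32 i4_val = p + i_macro;
 *     if(i4_val <= 0)
 *         i4_val = 0;       // cmpgt + and
 *     if(i4_val > 255)
 *         i4_val = 255;     // packus saturation
 *     // i4_val is the reconstructed 8-bit sample
 */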

/*
 ********************************************************************************
 *
 * @brief This function reconstructs a 4x4 chroma sub block from the quantized
 * residue and prediction buffer for a dc-only input pattern. The dc value is
 * taken from pi2_dc_src and is not inverse quantized here.
 *
 * @par Description:
 *  The dc residue is inverse transformed and added to the prediction buffer to
 *  reconstruct the end output. The prediction and output buffers hold
 *  interleaved chroma (Cb/Cr) samples; only the plane at even byte offsets
 *  from the given pointers is updated.
 *
 * @param[in] pi2_src
 *  quantized 4x4 block (unused in this dc-only path)
 *
 * @param[in] pu1_pred
 *  prediction 4x4 block (interleaved chroma)
 *
 * @param[out] pu1_out
 *  reconstructed 4x4 block (interleaved chroma)
 *
 * @param[in] pred_strd
 *  prediction buffer stride
 *
 * @param[in] out_strd
 *  recon buffer stride
 *
 * @param[in] pu2_iscal_mat
 *  pointer to the inverse scaling matrix (unused in this dc-only path)
 *
 * @param[in] pu2_weigh_mat
 *  pointer to the weighting (scaling list) matrix (unused in this dc-only path)
 *
 * @param[in] u4_qp_div_6
 *  Floor(qp/6) (unused in this dc-only path)
 *
 * @param[in] pi2_tmp
 *  temporary buffer of size 1*16 (unused in this dc-only path)
 *
 * @param[in] pi2_dc_src
 *  pointer to the dc value for this block
 *
 * @returns none
 *
 * @remarks none
 *
 *******************************************************************************
 */
void ih264_iquant_itrans_recon_chroma_4x4_dc_ssse3(WORD16 *pi2_src,
                                                   UWORD8 *pu1_pred,
                                                   UWORD8 *pu1_out,
                                                   WORD32 pred_strd,
                                                   WORD32 out_strd,
                                                   const UWORD16 *pu2_iscal_mat,
                                                   const UWORD16 *pu2_weigh_mat,
                                                   UWORD32 u4_qp_div_6,
                                                   WORD16 *pi2_tmp,
                                                   WORD16 *pi2_dc_src)
{
    WORD16 q0 = pi2_dc_src[0]; // DC value won't be dequantized for chroma inverse transform
    WORD16 i_macro = ((q0 + 32) >> 6);

    __m128i pred_r0, pred_r1, pred_r2, pred_r3, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i chroma_mask = _mm_set1_epi16(0xFF);
    __m128i value_add = _mm_set1_epi16(i_macro);
    __m128i out_r0, out_r1, out_r2, out_r3;

    UNUSED(pi2_src);
    UNUSED(pu2_iscal_mat);
    UNUSED(pu2_weigh_mat);
    UNUSED(u4_qp_div_6);
    UNUSED(pi2_tmp);

    // Load pred buffer
    pred_r0 = _mm_loadl_epi64((__m128i *)(&pu1_pred[0])); //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *)(&pu1_pred[pred_strd])); //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *)(&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *)(&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);

    pred_r0 = _mm_unpacklo_epi64(pred_r0, pred_r1); //p00 p01 p02 p03 p10 p11 p12 p13
    pred_r2 = _mm_unpacklo_epi64(pred_r2, pred_r3); //p20 p21 p22 p23 p30 p31 p32 p33

    pred_r0 = _mm_add_epi16(value_add, pred_r0);
    pred_r2 = _mm_add_epi16(value_add, pred_r2);

    /*------------------------------------------------------------------*/
    // Clipping the results to 8 bits
    sign_reg = _mm_cmpgt_epi16(pred_r0, zero_8x16b); // sign check
    pred_r0 = _mm_and_si128(pred_r0, sign_reg);
    sign_reg = _mm_cmpgt_epi16(pred_r2, zero_8x16b);
    pred_r2 = _mm_and_si128(pred_r2, sign_reg);

    pred_r0 = _mm_packus_epi16(pred_r0, pred_r2);
    pred_r1 = _mm_srli_si128(pred_r0, 4);
    pred_r2 = _mm_srli_si128(pred_r1, 4);
    pred_r3 = _mm_srli_si128(pred_r2, 4);

    pred_r0 = _mm_unpacklo_epi8(pred_r0, zero_8x16b); //p00 p01 p02 p03 -- all 16 bits
    pred_r1 = _mm_unpacklo_epi8(pred_r1, zero_8x16b); //p10 p11 p12 p13 -- all 16 bits
    pred_r2 = _mm_unpacklo_epi8(pred_r2, zero_8x16b); //p20 p21 p22 p23 -- all 16 bits
    pred_r3 = _mm_unpacklo_epi8(pred_r3, zero_8x16b); //p30 p31 p32 p33 -- all 16 bits

    chroma_mask = _mm_set1_epi16(0xFF00);
    out_r0 = _mm_loadl_epi64((__m128i *)(&pu1_out[0]));
    out_r1 = _mm_loadl_epi64((__m128i *)(&pu1_out[out_strd]));
    out_r2 = _mm_loadl_epi64((__m128i *)(&pu1_out[2 * out_strd]));
    out_r3 = _mm_loadl_epi64((__m128i *)(&pu1_out[3 * out_strd]));

    out_r0 = _mm_and_si128(out_r0, chroma_mask);
    out_r1 = _mm_and_si128(out_r1, chroma_mask);
    out_r2 = _mm_and_si128(out_r2, chroma_mask);
    out_r3 = _mm_and_si128(out_r3, chroma_mask);

    out_r0 = _mm_add_epi8(out_r0, pred_r0);
    out_r1 = _mm_add_epi8(out_r1, pred_r1);
    out_r2 = _mm_add_epi8(out_r2, pred_r2);
    out_r3 = _mm_add_epi8(out_r3, pred_r3);

    _mm_storel_epi64((__m128i *)(&pu1_out[0]), out_r0);
    _mm_storel_epi64((__m128i *)(&pu1_out[out_strd]), out_r1);
    _mm_storel_epi64((__m128i *)(&pu1_out[2 * out_strd]), out_r2);
    _mm_storel_epi64((__m128i *)(&pu1_out[3 * out_strd]), out_r3);
}
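
/*
 * Illustrative scalar sketch (not part of the library build): pu1_pred and
 * pu1_out address interleaved Cb/Cr data, and the 0x00FF / 0xFF00 masks above
 * select the plane at even byte offsets from the prediction while preserving
 * the other plane already present in the output. Assuming that layout, the
 * dc-only chroma path is equivalent to:
 *
 *     WORD32 i, j, i4_val;
 *     for(i = 0; i < 4; i++)
 *     {
 *         for(j = 0; j < 4; j++)
 *         {
 *             i4_val = pu1_pred[i * pred_strd + 2 * j] + i_macro;
 *             if(i4_val < 0)   i4_val = 0;
 *             if(i4_val > 255) i4_val = 255;
 *             pu1_out[i * out_strd + 2 * j] = (UWORD8)i4_val;
 *         }
 *     }
 */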