1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
/**
 *******************************************************************************
 *
 * @file
 *  isvc_resi_trans_quant_sse42.c
 *
 * @brief
 *  Contains function definitions for the single-stage forward transform for
 *  H.264. The functions compute the residue, perform the core transform and
 *  then quantize the result.
 *
 * @author
 *  Mohit
 *  [100664]
 *
 * @par List of Functions:
 *  - isvc_resi_trans_quant_4x4_sse42()
 *  - isvc_resi_trans_quant_chroma_4x4_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
50 #include <immintrin.h>
51
52 #include "ih264_typedefs.h"
53 #include "ih264_debug.h"
54 #include "ih264_defs.h"
55 #include "ih264_trans_macros.h"
56 #include "ih264_macros.h"
57 #include "ih264_platform_macros.h"
58 #include "ih264_trans_data.h"
59 #include "ih264_size_defs.h"
60 #include "isvc_structs.h"
61 #include "isvc_trans_quant_itrans_iquant.h"
62
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward transform and quantization on a 4x4 block
 *
 * @par Description:
 *  The function accepts a source buffer and an estimation buffer. From
 *  these, it computes the residue. This residue is then transformed and
 *  quantized. The transform and quantization are computed in place, using
 *  the residue buffer.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] i4_src_stride
 *  Source stride
 *
 * @param[in] i4_pred_stride
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void isvc_resi_trans_quant_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
                                     buffer_container_t *ps_out,
                                     buffer_container_t *ps_upsampled_res,
                                     resi_trans_quant_constants_t *ps_quant_constants,
                                     UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                     UWORD8 u1_use_upsampled_res)
137 {
138 const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
139 const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
140 UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
141 UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
142 WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
143 WORD32 mask0, mask1;
144 __m128i sum0, sum1, sum2, cmp0, cmp1;
145 __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
146 __m128i temp_2 = _mm_set1_epi16(2);
147 __m128i temp_1 = _mm_set1_epi16(1);
148 __m128i src_r0, src_r1, src_r2, src_r3;
149 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
150 __m128i temp0, temp1, temp2, temp3;
151 /* all bits reset to zero */
152 __m128i zero_8x16b = _mm_setzero_si128();
153 __m128i sign_reg0, sign_reg2;
154 __m128i scalemat_r0_r1, scalemat_r2_r3;
155 __m128i threshold_r0_r1, threshold_r2_r3;
156 __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
157
158 UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
159 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
160 WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
161 WORD32 i4_src_stride = ps_src->i4_data_stride;
162 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
163 WORD32 i4_out_stride = ps_out->i4_data_stride;
164
165 ASSERT(0 == u1_use_upsampled_res);
166 ASSERT(4 == i4_out_stride);
167 UNUSED(u1_use_upsampled_res);
168 UNUSED(i4_out_stride);
169 UNUSED(ps_upsampled_res);
170
171 /* b00 b01 b02 b03 b10 b11 b12 b13
172 -- the scaling matrix 0th,1st row */
173 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
174
175 /* b20 b21 b22 b23 b30 b31 b32 b33
176 -- the scaling matrix 2nd,3rd row */
177 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
178
    /* b00 b01 b02 b03 b10 b11 b12 b13
       -- the threshold matrix 0th,1st row */
181 threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
182
183 /* b20 b21 b22 b23 b30 b31 b32 b33
184 -- the threshold matrix 2nd,3rd row */
185 threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
186
187 /* a00 a01 a02 a03 0 0 0 0 0
188 0 0 0 -- all 8 bits */
189 src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
190
191 /* a10 a11 a12 a13 0 0 0 0 0 0 0
192 0 -- all 8 bits */
193 src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
194
195 /* a20 a21 a22 a23 0 0 0 0 0 0 0
196 0 -- all 8 bits */
197 src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
198
199 /* a30 a31 a32 a33 0 0 0 0 0 0 0
200 0 -- all 8 bits */
201 src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
202
203 src_r0 = _mm_cvtepu8_epi16(src_r0);
204 src_r1 = _mm_cvtepu8_epi16(src_r1);
205 src_r2 = _mm_cvtepu8_epi16(src_r2);
206 src_r3 = _mm_cvtepu8_epi16(src_r3);
207
208 /* p00 p01 p02 p03 0 0 0 0 0
209 0 0 0 -- all 8 bits */
210 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
211
212 /* p10 p11 p12 p13 0 0 0 0 0
213 0 0 0 -- all 8 bits */
214 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
215
216 /* p20 p21 p22 p23 0 0 0 0 0
217 0 0 0 -- all 8 bits */
218 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
219
220 /* p30 p31 p32 p33 0 0 0 0 0
221 0 0 0 -- all 8 bits */
222 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
223
224 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
225 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
226 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
227 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
228
229 src_r0 = _mm_sub_epi16(src_r0, pred_r0);
230 src_r1 = _mm_sub_epi16(src_r1, pred_r1);
231 src_r2 = _mm_sub_epi16(src_r2, pred_r2);
232 src_r3 = _mm_sub_epi16(src_r3, pred_r3);
233
234 /* Perform Forward transform */
235 /*-------------------------------------------------------------*/
236 /* DCT [ Horizontal transformation ] */
237 /*-------------------------------------------------------------*/
238 // Matrix transpose
239 /*
240 * a0 a1 a2 a3
241 * b0 b1 b2 b3
242 * c0 c1 c2 c3
243 * d0 d1 d2 d3
244 */
245 /* a0 b0 a1 b1 a2 b2 a3 b3 */
246 temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
247 /* c0 d0 c1 d1 c2 d2 c3 d3 */
248 temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
249 /* a0 b0 c0 d0 a1 b1 c1 d1 */
250 temp1 = _mm_unpacklo_epi32(temp0, temp2);
251 /* a2 b2 c2 d2 a3 b3 c3 d3 */
252 temp3 = _mm_unpackhi_epi32(temp0, temp2);
253
254 /* a0 b0 c0 d0 */
255 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
256 /* a1 b1 c1 d1 */
257 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
258 /* a2 b2 c2 d2 */
259 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
260 /* a3 b3 c3 d3 */
261 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
262
263 /*----------------------------------------------------------*/
264 /* x0 = z0 + z3 */
265 temp0 = _mm_add_epi16(src_r0, src_r3);
266 /* x1 = z1 + z2 */
267 temp1 = _mm_add_epi16(src_r1, src_r2);
268 /* x2 = z1 - z2 */
269 temp2 = _mm_sub_epi16(src_r1, src_r2);
270 /* x3 = z0 - z3 */
271 temp3 = _mm_sub_epi16(src_r0, src_r3);
272
273 /* z0 = x0 + x1 */
274 src_r0 = _mm_add_epi16(temp0, temp1);
275 /* z1 = (x3 << 1) + x2 */
276 src_r1 = _mm_slli_epi16(temp3, 1);
277 src_r1 = _mm_add_epi16(src_r1, temp2);
278 /* z2 = x0 - x1 */
279 src_r2 = _mm_sub_epi16(temp0, temp1);
280 /* z3 = x3 - (x2 << 1) */
281 src_r3 = _mm_slli_epi16(temp2, 1);
282 src_r3 = _mm_sub_epi16(temp3, src_r3);
283
284 // Matrix transpose
285 /*
286 * a0 b0 c0 d0
287 * a1 b1 c1 d1
288 * a2 b2 c2 d2
289 * a3 b3 c3 d3
290 */
291 /* a0 a1 b0 b1 c0 c1 d0 d1 */
292 temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
293 /* a2 a3 b2 b3 c2 c3 d2 d3 */
294 temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
295 /* a0 a1 a2 a3 b0 b1 b2 b3 */
296 temp1 = _mm_unpacklo_epi32(temp0, temp2);
297 /* c0 c1 c2 c3 d0 d1 d2 d3 */
298 temp3 = _mm_unpackhi_epi32(temp0, temp2);
299
300 /* a0 a1 a2 a3 */
301 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
302 /* b0 b1 b2 b3 */
303 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
304 /* c0 c1 c2 c3 */
305 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
306 /* d0 d1 d2 d3 */
307 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
308
309 /*----------------------------------------------------------*/
310 /* x0 = z0 + z3 */
311 temp0 = _mm_add_epi16(src_r0, src_r3);
312 /* x1 = z1 + z2 */
313 temp1 = _mm_add_epi16(src_r1, src_r2);
314 /* x2 = z1 - z2 */
315 temp2 = _mm_sub_epi16(src_r1, src_r2);
316 /* x3 = z0 - z3 */
317 temp3 = _mm_sub_epi16(src_r0, src_r3);
318
319 /* z0 = x0 + x1 */
320 src_r0 = _mm_add_epi16(temp0, temp1);
321 /* z1 = (x3 << 1) + x2 */
322 src_r1 = _mm_slli_epi16(temp3, 1);
323 src_r1 = _mm_add_epi16(src_r1, temp2);
324 /* z2 = x0 - x1 */
325 src_r2 = _mm_sub_epi16(temp0, temp1);
326 /* z3 = x3 - (x2 << 1) */
327 src_r3 = _mm_slli_epi16(temp2, 1);
328 src_r3 = _mm_sub_epi16(temp3, src_r3);
329
330 /* get the first 16 bits from the register */
331 tmp_dc = _mm_extract_epi16(src_r0, 0);
332 *pi2_dc_out = tmp_dc;
333
334 /* a0 a1 a2 a3 b0 b1 b2 b3 */
335 src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
336 /* c0 c1 c2 c3 d0 d1 d2 d3 */
337 src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
338 sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
339 sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);
340
341 sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
342 sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);
343
344 sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
345 sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);
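    /* sign_reg0/sign_reg2 now hold +1 for non-negative and -1 for negative
       coefficients; they are used with _mm_sign_epi16 to restore signs after
       quantizing the absolute values */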
346
347 src_r0 = _mm_abs_epi16(src_r0);
348 src_r2 = _mm_abs_epi16(src_r2);
349
350 threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
351 threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);
352
353 src_r1 = _mm_srli_si128(src_r0, 8);
354 src_r0 = _mm_cvtepu16_epi32(src_r0);
355 src_r1 = _mm_cvtepu16_epi32(src_r1);
356 src_r3 = _mm_srli_si128(src_r2, 8);
357 src_r2 = _mm_cvtepu16_epi32(src_r2);
358 src_r3 = _mm_cvtepu16_epi32(src_r3);
359
360 temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
361 scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
362 temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
363 scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
364 temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
365 temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
366
367 temp0 = _mm_mullo_epi32(temp0, src_r0);
368 temp1 = _mm_mullo_epi32(temp1, src_r1);
369 temp2 = _mm_mullo_epi32(temp2, src_r2);
370 temp3 = _mm_mullo_epi32(temp3, src_r3);
371
372 temp0 = _mm_add_epi32(temp0, rnd_fact);
373 temp1 = _mm_add_epi32(temp1, rnd_fact);
374 temp2 = _mm_add_epi32(temp2, rnd_fact);
375 temp3 = _mm_add_epi32(temp3, rnd_fact);
376
377 temp0 = _mm_srli_epi32(temp0, u4_qbits);
378 temp1 = _mm_srli_epi32(temp1, u4_qbits);
379 temp2 = _mm_srli_epi32(temp2, u4_qbits);
380 temp3 = _mm_srli_epi32(temp3, u4_qbits);
381
382 temp0 = _mm_packs_epi32(temp0, temp1);
383 temp2 = _mm_packs_epi32(temp2, temp3);
384
385 temp0 = _mm_sign_epi16(temp0, sign_reg0);
386 temp2 = _mm_sign_epi16(temp2, sign_reg2);
387
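    /* zero out quantized coefficients whose pre-quantization absolute value
       was below the threshold matrix */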
388 temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
389 temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);
390
391 _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
392 _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
393
394 cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
395 cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
396
397 mask0 = _mm_movemask_epi8(cmp0);
398 mask1 = _mm_movemask_epi8(cmp1);
399 u4_zero_coeff = 0;
400
401 if(mask0)
402 {
403 if(mask0 == 0xffff)
404 u4_zero_coeff += 8;
405 else
406 {
407 cmp0 = _mm_and_si128(temp_1, cmp0);
408 sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
409 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
410 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
411 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
412 }
413 }
414 if(mask1)
415 {
416 if(mask1 == 0xffff)
417 u4_zero_coeff += 8;
418 else
419 {
420 cmp1 = _mm_and_si128(temp_1, cmp1);
421 sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
422 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
423 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
424 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
425 }
426 }
427
428 /* Return total nonzero coefficients in the current sub block */
429 u4_nonzero_coeff = 16 - u4_zero_coeff;
430 *pu1_nnz = u4_nonzero_coeff;
431 }
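
/* The quantization applied above corresponds, per coefficient, to the scalar
 * rule sketched below: scale the absolute value, add the rounding factor,
 * shift right by qbits, restore the sign, and zero the result when the
 * transformed magnitude falls below the threshold matrix. This is an
 * illustrative sketch only (NNZ counting and DC extraction omitted); the
 * helper name is an assumption, not part of the library API, so the code is
 * kept out of the build.
 */
#if 0
static void isvc_quant_4x4_scalar_sketch(const WORD16 *pi2_coeffs, WORD16 *pi2_out,
                                         const UWORD16 *pu2_scale_matrix,
                                         const UWORD16 *pu2_threshold_matrix,
                                         UWORD32 u4_round_factor, UWORD32 u4_qbits)
{
    WORD32 i;

    for(i = 0; i < 16; i++)
    {
        WORD32 i4_sign = (pi2_coeffs[i] < 0) ? -1 : 1;
        WORD32 i4_abs = (pi2_coeffs[i] < 0) ? -pi2_coeffs[i] : pi2_coeffs[i];

        if(i4_abs < pu2_threshold_matrix[i])
        {
            /* below threshold => coefficient is dropped */
            pi2_out[i] = 0;
        }
        else
        {
            /* |level| = (|coeff| * scale + round) >> qbits, sign restored */
            WORD32 i4_level = (WORD32) ((i4_abs * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits);

            pi2_out[i] = (WORD16) (i4_sign * i4_level);
        }
    }
}
#endif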
432
void isvc_resi_trans_quant_4x4_with_res_pred_sse42(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
437 {
438 const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
439 const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
440 UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
441 UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
442 WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
443 WORD32 mask0, mask1;
444 __m128i sum0, sum1, sum2, cmp0, cmp1;
445 __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
446 __m128i temp_2 = _mm_set1_epi16(2);
447 __m128i temp_1 = _mm_set1_epi16(1);
448 __m128i src_r0, src_r1, src_r2, src_r3;
449 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
450 __m128i temp0, temp1, temp2, temp3;
451 /* all bits reset to zero */
452 __m128i zero_8x16b = _mm_setzero_si128();
453 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
454 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
455 __m128i sign_reg0, sign_reg2;
456 __m128i scalemat_r0_r1, scalemat_r2_r3;
457 __m128i upsampled_res0, upsampled_res1, upsampled_res2, upsampled_res3;
458 __m128i threshold_r0_r1, threshold_r2_r3;
459 __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
460
461 UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
462 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
463 WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
464 WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
465 WORD32 i4_src_stride = ps_src->i4_data_stride;
466 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
467 WORD32 i4_out_stride = ps_out->i4_data_stride;
468 WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
469
470 ASSERT(1 == u1_use_upsampled_res);
471 ASSERT(4 == i4_out_stride);
472 UNUSED(u1_use_upsampled_res);
473 UNUSED(i4_out_stride);
474 UNUSED(ps_upsampled_res);
475
476 /* b00 b01 b02 b03 b10 b11 b12 b13
477 -- the scaling matrix 0th,1st row */
478 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
479
480 /* b20 b21 b22 b23 b30 b31 b32 b33
481 -- the scaling matrix 2nd,3rd row */
482 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
483
    /* b00 b01 b02 b03 b10 b11 b12 b13
       -- the threshold matrix 0th,1st row */
486 threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
487
488 /* b20 b21 b22 b23 b30 b31 b32 b33
489 -- the threshold matrix 2nd,3rd row */
490 threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
491
492 /* a00 a01 a02 a03 0 0 0 0 0
493 0 0 0 -- all 8 bits */
494 src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
495
496 /* a10 a11 a12 a13 0 0 0 0 0 0 0
497 0 -- all 8 bits */
498 src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
499
500 /* a20 a21 a22 a23 0 0 0 0 0 0 0
501 0 -- all 8 bits */
502 src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
503
504 /* a30 a31 a32 a33 0 0 0 0 0 0 0
505 0 -- all 8 bits */
506 src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
507
508 src_r0 = _mm_cvtepu8_epi16(src_r0);
509 src_r1 = _mm_cvtepu8_epi16(src_r1);
510 src_r2 = _mm_cvtepu8_epi16(src_r2);
511 src_r3 = _mm_cvtepu8_epi16(src_r3);
512
513 /* p00 p01 p02 p03 0 0 0 0 0
514 0 0 0 -- all 8 bits */
515 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
516
517 /* p10 p11 p12 p13 0 0 0 0 0
518 0 0 0 -- all 8 bits */
519 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
520
521 /* p20 p21 p22 p23 0 0 0 0 0
522 0 0 0 -- all 8 bits */
523 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
524
525 /* p30 p31 p32 p33 0 0 0 0 0
526 0 0 0 -- all 8 bits */
527 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
528
529 pred_r0 = _mm_cvtepu8_epi16(pred_r0);
530 pred_r1 = _mm_cvtepu8_epi16(pred_r1);
531 pred_r2 = _mm_cvtepu8_epi16(pred_r2);
532 pred_r3 = _mm_cvtepu8_epi16(pred_r3);
533
534 src_r0 = _mm_sub_epi16(src_r0, pred_r0);
535 src_r1 = _mm_sub_epi16(src_r1, pred_r1);
536 src_r2 = _mm_sub_epi16(src_r2, pred_r2);
537 src_r3 = _mm_sub_epi16(src_r3, pred_r3);
538
539 /* load upsampled residual values and subtract from
540 the previous residue */
541 upsampled_res0 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[0]));
542
543 upsampled_res1 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[i4_upsampled_res_stride]));
544
545 upsampled_res2 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[2 * i4_upsampled_res_stride]));
546
547 upsampled_res3 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[3 * i4_upsampled_res_stride]));
548
549 src_r0 = _mm_sub_epi16(src_r0, upsampled_res0);
550 src_r1 = _mm_sub_epi16(src_r1, upsampled_res1);
551 src_r2 = _mm_sub_epi16(src_r2, upsampled_res2);
552 src_r3 = _mm_sub_epi16(src_r3, upsampled_res3);
553
554 src_r1 = _mm_unpacklo_epi16(src_r0, src_r1);
555 src_r3 = _mm_unpacklo_epi16(src_r2, src_r3);
556
557 /* Saturate all values < -255 to -255 and retain the rest as it is */
558 src_r1 = _mm_max_epi16(src_r1, neg_255_8x16b);
559 /* Saturate all values > 255 to 255 and retain the rest as it is */
560 temp0 = _mm_min_epi16(src_r1, pos_255_8x16b);
561
562 /* Saturate all values < -255 to -255 and retain the rest as it is */
563 src_r3 = _mm_max_epi16(src_r3, neg_255_8x16b);
564 /* Saturate all values > 255 to 255 and retain the rest as it is */
565 temp2 = _mm_min_epi16(src_r3, pos_255_8x16b);
566
567 /* Perform Forward transform */
568 /*-------------------------------------------------------------*/
569 /* DCT [ Horizontal transformation ] */
570 /*-------------------------------------------------------------*/
571 // Matrix transpose
572 /*
573 * a0 a1 a2 a3
574 * b0 b1 b2 b3
575 * c0 c1 c2 c3
576 * d0 d1 d2 d3
577 */
578 /* a0 b0 c0 d0 a1 b1 c1 d1 */
579 temp1 = _mm_unpacklo_epi32(temp0, temp2);
580 /* a2 b2 c2 d2 a3 b3 c3 d3 */
581 temp3 = _mm_unpackhi_epi32(temp0, temp2);
582
583 /* a0 b0 c0 d0 */
584 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
585 /* a1 b1 c1 d1 */
586 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
587 /* a2 b2 c2 d2 */
588 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
589 /* a3 b3 c3 d3 */
590 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
591
592 /*----------------------------------------------------------*/
593 /* x0 = z0 + z3 */
594 temp0 = _mm_add_epi16(src_r0, src_r3);
595 /* x1 = z1 + z2 */
596 temp1 = _mm_add_epi16(src_r1, src_r2);
597 /* x2 = z1 - z2 */
598 temp2 = _mm_sub_epi16(src_r1, src_r2);
599 /* x3 = z0 - z3 */
600 temp3 = _mm_sub_epi16(src_r0, src_r3);
601
602 /* z0 = x0 + x1 */
603 src_r0 = _mm_add_epi16(temp0, temp1);
604 /* z1 = (x3 << 1) + x2 */
605 src_r1 = _mm_slli_epi16(temp3, 1);
606 src_r1 = _mm_add_epi16(src_r1, temp2);
607 /* z2 = x0 - x1 */
608 src_r2 = _mm_sub_epi16(temp0, temp1);
609 /* z3 = x3 - (x2 << 1) */
610 src_r3 = _mm_slli_epi16(temp2, 1);
611 src_r3 = _mm_sub_epi16(temp3, src_r3);
612
613 // Matrix transpose
614 /*
615 * a0 b0 c0 d0
616 * a1 b1 c1 d1
617 * a2 b2 c2 d2
618 * a3 b3 c3 d3
619 */
620 /* a0 a1 b0 b1 c0 c1 d0 d1 */
621 temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
622 /* a2 a3 b2 b3 c2 c3 d2 d3 */
623 temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
624 /* a0 a1 a2 a3 b0 b1 b2 b3 */
625 temp1 = _mm_unpacklo_epi32(temp0, temp2);
626 /* c0 c1 c2 c3 d0 d1 d2 d3 */
627 temp3 = _mm_unpackhi_epi32(temp0, temp2);
628
629 /* a0 a1 a2 a3 */
630 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
631 /* b0 b1 b2 b3 */
632 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
633 /* c0 c1 c2 c3 */
634 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
635 /* d0 d1 d2 d3 */
636 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
637
638 /*----------------------------------------------------------*/
639 /* x0 = z0 + z3 */
640 temp0 = _mm_add_epi16(src_r0, src_r3);
641 /* x1 = z1 + z2 */
642 temp1 = _mm_add_epi16(src_r1, src_r2);
643 /* x2 = z1 - z2 */
644 temp2 = _mm_sub_epi16(src_r1, src_r2);
645 /* x3 = z0 - z3 */
646 temp3 = _mm_sub_epi16(src_r0, src_r3);
647
648 /* z0 = x0 + x1 */
649 src_r0 = _mm_add_epi16(temp0, temp1);
650 /* z1 = (x3 << 1) + x2 */
651 src_r1 = _mm_slli_epi16(temp3, 1);
652 src_r1 = _mm_add_epi16(src_r1, temp2);
653 /* z2 = x0 - x1 */
654 src_r2 = _mm_sub_epi16(temp0, temp1);
655 /* z3 = x3 - (x2 << 1) */
656 src_r3 = _mm_slli_epi16(temp2, 1);
657 src_r3 = _mm_sub_epi16(temp3, src_r3);
658
659 /* get the first 16 bits from the register */
660 tmp_dc = _mm_extract_epi16(src_r0, 0);
661 *pi2_dc_out = tmp_dc;
662
663 /* a0 a1 a2 a3 b0 b1 b2 b3 */
664 src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
665 /* c0 c1 c2 c3 d0 d1 d2 d3 */
666 src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
667 sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
668 sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);
669
670 sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
671 sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);
672
673 sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
674 sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);
675
676 src_r0 = _mm_abs_epi16(src_r0);
677 src_r2 = _mm_abs_epi16(src_r2);
678
679 threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
680 threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);
681
682 src_r1 = _mm_srli_si128(src_r0, 8);
683 src_r0 = _mm_cvtepu16_epi32(src_r0);
684 src_r1 = _mm_cvtepu16_epi32(src_r1);
685 src_r3 = _mm_srli_si128(src_r2, 8);
686 src_r2 = _mm_cvtepu16_epi32(src_r2);
687 src_r3 = _mm_cvtepu16_epi32(src_r3);
688
689 temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
690 scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
691 temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
692 scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
693 temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
694 temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
695
696 temp0 = _mm_mullo_epi32(temp0, src_r0);
697 temp1 = _mm_mullo_epi32(temp1, src_r1);
698 temp2 = _mm_mullo_epi32(temp2, src_r2);
699 temp3 = _mm_mullo_epi32(temp3, src_r3);
700
701 temp0 = _mm_add_epi32(temp0, rnd_fact);
702 temp1 = _mm_add_epi32(temp1, rnd_fact);
703 temp2 = _mm_add_epi32(temp2, rnd_fact);
704 temp3 = _mm_add_epi32(temp3, rnd_fact);
705
706 temp0 = _mm_srli_epi32(temp0, u4_qbits);
707 temp1 = _mm_srli_epi32(temp1, u4_qbits);
708 temp2 = _mm_srli_epi32(temp2, u4_qbits);
709 temp3 = _mm_srli_epi32(temp3, u4_qbits);
710
711 temp0 = _mm_packs_epi32(temp0, temp1);
712 temp2 = _mm_packs_epi32(temp2, temp3);
713
714 temp0 = _mm_sign_epi16(temp0, sign_reg0);
715 temp2 = _mm_sign_epi16(temp2, sign_reg2);
716
717 temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
718 temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);
719
720 _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
721 _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
722
723 cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
724 cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
725
726 mask0 = _mm_movemask_epi8(cmp0);
727 mask1 = _mm_movemask_epi8(cmp1);
728 u4_zero_coeff = 0;
729 if(mask0)
730 {
731 if(mask0 == 0xffff)
732 u4_zero_coeff += 8;
733 else
734 {
735 cmp0 = _mm_and_si128(temp_1, cmp0);
736 sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
737 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
738 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
739 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
740 }
741 }
742 if(mask1)
743 {
744 if(mask1 == 0xffff)
745 u4_zero_coeff += 8;
746 else
747 {
748 cmp1 = _mm_and_si128(temp_1, cmp1);
749 sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
750 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
751 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
752 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
753 }
754 }
755
756 /* Return total nonzero coefficients in the current sub block */
757 u4_nonzero_coeff = 16 - u4_zero_coeff;
758 *pu1_nnz = u4_nonzero_coeff;
759 }
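
/* Residual prediction step performed above, sketched in scalar form: the
 * inter-layer (upsampled) residual is subtracted from the spatial residue
 * and the result is clamped to [-255, 255] before the forward transform.
 * Illustrative only; the helper name is an assumption and the code is kept
 * out of the build.
 */
#if 0
static WORD16 isvc_res_pred_sample_sketch(UWORD8 u1_src, UWORD8 u1_pred,
                                          WORD16 i2_upsampled_res)
{
    WORD32 i4_res = (WORD32) u1_src - (WORD32) u1_pred - (WORD32) i2_upsampled_res;

    /* saturation matching the _mm_max_epi16/_mm_min_epi16 pair above */
    if(i4_res < -((WORD32) UINT8_MAX)) i4_res = -((WORD32) UINT8_MAX);
    if(i4_res > ((WORD32) UINT8_MAX)) i4_res = ((WORD32) UINT8_MAX);

    return (WORD16) i4_res;
}
#endif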
760
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward transform and quantization on a 4x4
 *  chroma block
 *
 * @par Description:
 *  The function accepts a source buffer and an estimation buffer. From
 *  these, it computes the residue. This residue is then transformed and
 *  quantized. The transform and quantization are computed in place, using
 *  the residue buffer.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] i4_src_stride
 *  Source stride
 *
 * @param[in] i4_pred_stride
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void isvc_resi_trans_quant_chroma_4x4_sse42(buffer_container_t *ps_src,
                                            buffer_container_t *ps_pred,
                                            buffer_container_t *ps_out,
                                            buffer_container_t *ps_upsampled_res,
                                            resi_trans_quant_constants_t *ps_quant_constants,
                                            UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
                                            UWORD8 u1_use_upsampled_res)
839 {
840 UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
841 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
842 WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
843 WORD32 i4_src_stride = ps_src->i4_data_stride;
844 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
845 WORD32 i4_out_stride = ps_out->i4_data_stride;
846 const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
847 const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
848 UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
849 UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
850 WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
851 WORD32 mask0, mask1;
852 __m128i cmp0, cmp1, sum0, sum1, sum2;
853 __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
854 __m128i temp_2 = _mm_set1_epi16(2);
855 __m128i temp_1 = _mm_set1_epi16(1);
856 __m128i src_r0, src_r1, src_r2, src_r3;
857 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
858 __m128i temp0, temp1, temp2, temp3;
859 /* all bits reset to zero */
860 __m128i zero_8x16b = _mm_setzero_si128();
861 __m128i sign_reg0, sign_reg2;
862 __m128i scalemat_r0_r1, scalemat_r2_r3;
863 __m128i threshold_r0_r1, threshold_r2_r3;
864 __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
865 __m128i chroma_mask = _mm_set1_epi16(0xFF);
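    /* the source and prediction point to interleaved chroma; masking each
       16-bit lane with 0x00FF keeps the samples of one plane (the bytes at
       even offsets) widened to 16 bits */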
866
867 ASSERT(0 == u1_use_upsampled_res);
868 ASSERT(4 == i4_out_stride);
869 UNUSED(u1_use_upsampled_res);
870 UNUSED(i4_out_stride);
871 UNUSED(ps_upsampled_res);
872
873 /* b00 b01 b02 b03 b10 b11 b12 b13
874 -- the scaling matrix 0th,1st row */
875 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
876
877 /* b20 b21 b22 b23 b30 b31 b32 b33
878 -- the scaling matrix 2nd,3rd row */
879 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
880
    /* b00 b01 b02 b03 b10 b11 b12 b13
       -- the threshold matrix 0th,1st row */
883 threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
884
885 /* b20 b21 b22 b23 b30 b31 b32 b33
886 -- the threshold matrix 2nd,3rd row */
887 threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
888
889 /* a00 a01 a02 a03 0 0 0 0 0
890 0 0 0 -- all 8 bits */
891 src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
892 /* a10 a11 a12 a13 0 0 0 0 0 0 0
893 0 -- all 8 bits */
894 src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
895 /* a20 a21 a22 a23 0 0 0 0 0 0 0
896 0 -- all 8 bits */
897 src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
898 /* a30 a31 a32 a33 0 0 0 0 0 0 0
899 0 -- all 8 bits */
900 src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
901
902 src_r0 = _mm_and_si128(src_r0, chroma_mask);
903 src_r1 = _mm_and_si128(src_r1, chroma_mask);
904 src_r2 = _mm_and_si128(src_r2, chroma_mask);
905 src_r3 = _mm_and_si128(src_r3, chroma_mask);
906
907 /* p00 p01 p02 p03 0 0 0 0 0
908 0 0 0 -- all 8 bits */
909 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
910 /* p10 p11 p12 p13 0 0 0 0 0
911 0 0 0 -- all 8 bits */
912 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
913 /* p20 p21 p22 p23 0 0 0 0 0
914 0 0 0 -- all 8 bits */
915 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
916 /* p30 p31 p32 p33 0 0 0 0 0
917 0 0 0 -- all 8 bits */
918 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
919
920 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
921 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
922 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
923 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
924
925 src_r0 = _mm_sub_epi16(src_r0, pred_r0);
926 src_r1 = _mm_sub_epi16(src_r1, pred_r1);
927 src_r2 = _mm_sub_epi16(src_r2, pred_r2);
928 src_r3 = _mm_sub_epi16(src_r3, pred_r3);
929
930 /* Perform Forward transform */
931 /*-------------------------------------------------------------*/
932 /* DCT [ Horizontal transformation ] */
933 /*-------------------------------------------------------------*/
934 // Matrix transpose
935 /*
936 * a0 a1 a2 a3
937 * b0 b1 b2 b3
938 * c0 c1 c2 c3
939 * d0 d1 d2 d3
940 */
941 /* a0 b0 a1 b1 a2 b2 a3 b3 */
942 temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
943 /* c0 d0 c1 d1 c2 d2 c3 d3 */
944 temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
945 /* a0 b0 c0 d0 a1 b1 c1 d1 */
946 temp1 = _mm_unpacklo_epi32(temp0, temp2);
947 /* a2 b2 c2 d2 a3 b3 c3 d3 */
948 temp3 = _mm_unpackhi_epi32(temp0, temp2);
949
950 /* a0 b0 c0 d0 */
951 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
952 /* a1 b1 c1 d1 */
953 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
954 /* a2 b2 c2 d2 */
955 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
956 /* a3 b3 c3 d3 */
957 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
958
959 /*----------------------------------------------------------*/
960 /* x0 = z0 + z3 */
961 temp0 = _mm_add_epi16(src_r0, src_r3);
962 /* x1 = z1 + z2 */
963 temp1 = _mm_add_epi16(src_r1, src_r2);
964 /* x2 = z1 - z2 */
965 temp2 = _mm_sub_epi16(src_r1, src_r2);
966 /* x3 = z0 - z3 */
967 temp3 = _mm_sub_epi16(src_r0, src_r3);
968
969 /* z0 = x0 + x1 */
970 src_r0 = _mm_add_epi16(temp0, temp1);
971 /* z1 = (x3 << 1) + x2 */
972 src_r1 = _mm_slli_epi16(temp3, 1);
973 src_r1 = _mm_add_epi16(src_r1, temp2);
974 /* z2 = x0 - x1 */
975 src_r2 = _mm_sub_epi16(temp0, temp1);
976 /* z3 = x3 - (x2 << 1) */
977 src_r3 = _mm_slli_epi16(temp2, 1);
978 src_r3 = _mm_sub_epi16(temp3, src_r3);
979
980 // Matrix transpose
981 /*
982 * a0 b0 c0 d0
983 * a1 b1 c1 d1
984 * a2 b2 c2 d2
985 * a3 b3 c3 d3
986 */
987 /* a0 a1 b0 b1 c0 c1 d0 d1 */
988 temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
989 /* a2 a3 b2 b3 c2 c3 d2 d3 */
990 temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
991 /* a0 a1 a2 a3 b0 b1 b2 b3 */
992 temp1 = _mm_unpacklo_epi32(temp0, temp2);
993 /* c0 c1 c2 c3 d0 d1 d2 d3 */
994 temp3 = _mm_unpackhi_epi32(temp0, temp2);
995
996 /* a0 a1 a2 a3 */
997 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
998 /* b0 b1 b2 b3 */
999 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
1000 /* c0 c1 c2 c3 */
1001 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
1002 /* d0 d1 d2 d3 */
1003 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
1004
1005 /*----------------------------------------------------------*/
1006 /* x0 = z0 + z3 */
1007 temp0 = _mm_add_epi16(src_r0, src_r3);
1008 /* x1 = z1 + z2 */
1009 temp1 = _mm_add_epi16(src_r1, src_r2);
1010 /* x2 = z1 - z2 */
1011 temp2 = _mm_sub_epi16(src_r1, src_r2);
1012 /* x3 = z0 - z3 */
1013 temp3 = _mm_sub_epi16(src_r0, src_r3);
1014
1015 /* z0 = x0 + x1 */
1016 src_r0 = _mm_add_epi16(temp0, temp1);
1017 /* z1 = (x3 << 1) + x2 */
1018 src_r1 = _mm_slli_epi16(temp3, 1);
1019 src_r1 = _mm_add_epi16(src_r1, temp2);
1020 /* z2 = x0 - x1 */
1021 src_r2 = _mm_sub_epi16(temp0, temp1);
1022 /* z3 = x3 - (x2 << 1) */
1023 src_r3 = _mm_slli_epi16(temp2, 1);
1024 src_r3 = _mm_sub_epi16(temp3, src_r3);
1025
1026 /* get the first 16 bits from the register */
1027 tmp_dc = _mm_extract_epi16(src_r0, 0);
1028 *pi2_dc_out = tmp_dc;
1029
1030 /* a0 a1 a2 a3 b0 b1 b2 b3 */
1031 src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
1032 /* c0 c1 c2 c3 d0 d1 d2 d3 */
1033 src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
1034 sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
1035 sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);
1036
1037 sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
1038 sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);
1039
1040 sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
1041 sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);
1042
1043 src_r0 = _mm_abs_epi16(src_r0);
1044 src_r2 = _mm_abs_epi16(src_r2);
1045
1046 threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
1047 threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);
1048
1049 src_r1 = _mm_srli_si128(src_r0, 8);
1050 src_r0 = _mm_cvtepu16_epi32(src_r0);
1051 src_r1 = _mm_cvtepu16_epi32(src_r1);
1052 src_r3 = _mm_srli_si128(src_r2, 8);
1053 src_r2 = _mm_cvtepu16_epi32(src_r2);
1054 src_r3 = _mm_cvtepu16_epi32(src_r3);
1055
1056 temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
1057 scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
1058 temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
1059 scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
1060 temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
1061 temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
1062
1063 temp0 = _mm_mullo_epi32(temp0, src_r0);
1064 temp1 = _mm_mullo_epi32(temp1, src_r1);
1065 temp2 = _mm_mullo_epi32(temp2, src_r2);
1066 temp3 = _mm_mullo_epi32(temp3, src_r3);
1067
1068 temp0 = _mm_add_epi32(temp0, rnd_fact);
1069 temp1 = _mm_add_epi32(temp1, rnd_fact);
1070 temp2 = _mm_add_epi32(temp2, rnd_fact);
1071 temp3 = _mm_add_epi32(temp3, rnd_fact);
1072
1073 temp0 = _mm_srli_epi32(temp0, u4_qbits);
1074 temp1 = _mm_srli_epi32(temp1, u4_qbits);
1075 temp2 = _mm_srli_epi32(temp2, u4_qbits);
1076 temp3 = _mm_srli_epi32(temp3, u4_qbits);
1077
1078 temp0 = _mm_packs_epi32(temp0, temp1);
1079 temp2 = _mm_packs_epi32(temp2, temp3);
1080
1081 temp0 = _mm_sign_epi16(temp0, sign_reg0);
1082 temp2 = _mm_sign_epi16(temp2, sign_reg2);
1083
1084 temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
1085 temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);
1086
1087 _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
1088 _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
1089
1090 cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
1091 cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
1092
1093 mask0 = _mm_movemask_epi8(cmp0);
1094 mask1 = _mm_movemask_epi8(cmp1);
1095 u4_zero_coeff = 0;
1096 if(mask0)
1097 {
1098 if(mask0 == 0xffff)
1099 u4_zero_coeff += 8;
1100 else
1101 {
1102 cmp0 = _mm_and_si128(temp_1, cmp0);
1103 sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
1104 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
1105 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
1106 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
1107 }
1108 }
1109 if(mask1)
1110 {
1111 if(mask1 == 0xffff)
1112 u4_zero_coeff += 8;
1113 else
1114 {
1115 cmp1 = _mm_and_si128(temp_1, cmp1);
1116 sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
1117 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
1118 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
1119 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
1120 }
1121 }
1122
1123 /* Return total nonzero coefficients in the current sub block */
1124 u4_nonzero_coeff = 16 - u4_zero_coeff;
1125 *pu1_nnz = u4_nonzero_coeff;
1126 }
1127
void isvc_resi_trans_quant_chroma_4x4_with_res_pred_sse42(
    buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
    buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
    UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
1132 {
1133 UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
1134 UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1135 WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
1136 WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
1137 WORD32 i4_src_stride = ps_src->i4_data_stride;
1138 WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1139 WORD32 i4_out_stride = ps_out->i4_data_stride;
1140 WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
1141 const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
1142 const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
1143 UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
1144 UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
1145 WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
1146 WORD32 mask0, mask1;
1147 __m128i cmp0, cmp1, sum0, sum1, sum2;
1148 __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
1149 __m128i temp_2 = _mm_set1_epi16(2);
1150 __m128i temp_1 = _mm_set1_epi16(1);
1151 __m128i src_r0, src_r1, src_r2, src_r3;
1152 __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1153 __m128i temp0, temp1, temp2, temp3;
1154 /* all bits reset to zero */
1155 __m128i zero_8x16b = _mm_setzero_si128();
1156 __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1157 __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1158 __m128i sign_reg0, sign_reg2;
1159 __m128i scalemat_r0_r1, scalemat_r2_r3;
1160 __m128i upsampled_res0, upsampled_res1, upsampled_res2, upsampled_res3;
1161 __m128i threshold_r0_r1, threshold_r2_r3;
1162 __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
1163 __m128i chroma_mask = _mm_set1_epi16(0xFF);
1164
1165 ASSERT(1 == u1_use_upsampled_res);
1166 ASSERT(4 == i4_out_stride);
1167 UNUSED(u1_use_upsampled_res);
1168 UNUSED(i4_out_stride);
1169 UNUSED(ps_upsampled_res);
1170
1171 /* b00 b01 b02 b03 b10 b11 b12 b13
1172 -- the scaling matrix 0th,1st row */
1173 scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
1174
1175 /* b20 b21 b22 b23 b30 b31 b32 b33
1176 -- the scaling matrix 2nd,3rd row */
1177 scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
1178
    /* b00 b01 b02 b03 b10 b11 b12 b13
       -- the threshold matrix 0th,1st row */
1181 threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
1182
1183 /* b20 b21 b22 b23 b30 b31 b32 b33
1184 -- the threshold matrix 2nd,3rd row */
1185 threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
1186
1187 /* a00 a01 a02 a03 0 0 0 0 0
1188 0 0 0 -- all 8 bits */
1189 src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
1190 /* a10 a11 a12 a13 0 0 0 0 0 0 0
1191 0 -- all 8 bits */
1192 src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
1193 /* a20 a21 a22 a23 0 0 0 0 0 0 0
1194 0 -- all 8 bits */
1195 src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
1196 /* a30 a31 a32 a33 0 0 0 0 0 0 0
1197 0 -- all 8 bits */
1198 src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
1199
1200 src_r0 = _mm_and_si128(src_r0, chroma_mask);
1201 src_r1 = _mm_and_si128(src_r1, chroma_mask);
1202 src_r2 = _mm_and_si128(src_r2, chroma_mask);
1203 src_r3 = _mm_and_si128(src_r3, chroma_mask);
1204
1205 /* p00 p01 p02 p03 0 0 0 0 0
1206 0 0 0 -- all 8 bits */
1207 pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1208 /* p10 p11 p12 p13 0 0 0 0 0
1209 0 0 0 -- all 8 bits */
1210 pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1211 /* p20 p21 p22 p23 0 0 0 0 0
1212 0 0 0 -- all 8 bits */
1213 pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1214 /* p30 p31 p32 p33 0 0 0 0 0
1215 0 0 0 -- all 8 bits */
1216 pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1217
1218 pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1219 pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1220 pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1221 pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1222
1223 src_r0 = _mm_sub_epi16(src_r0, pred_r0);
1224 src_r1 = _mm_sub_epi16(src_r1, pred_r1);
1225 src_r2 = _mm_sub_epi16(src_r2, pred_r2);
1226 src_r3 = _mm_sub_epi16(src_r3, pred_r3);
1227
1228 /* load upsampled residual values and subtract from
1229 the previous residue */
1230 upsampled_res0 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[0]));
1231
1232 upsampled_res1 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[i4_upsampled_res_stride]));
1233
1234 upsampled_res2 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[2 * i4_upsampled_res_stride]));
1235
1236 upsampled_res3 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[3 * i4_upsampled_res_stride]));
1237
1238 src_r0 = _mm_sub_epi16(src_r0, upsampled_res0);
1239 src_r1 = _mm_sub_epi16(src_r1, upsampled_res1);
1240 src_r2 = _mm_sub_epi16(src_r2, upsampled_res2);
1241 src_r3 = _mm_sub_epi16(src_r3, upsampled_res3);
1242
1243 src_r1 = _mm_unpacklo_epi16(src_r0, src_r1);
1244 src_r3 = _mm_unpacklo_epi16(src_r2, src_r3);
1245
1246 /* Saturate all values < -255 to -255 and retain the rest as it is */
1247 src_r1 = _mm_max_epi16(src_r1, neg_255_8x16b);
1248 /* Saturate all values > 255 to 255 and retain the rest as it is */
1249 temp0 = _mm_min_epi16(src_r1, pos_255_8x16b);
1250
1251 /* Saturate all values < -255 to -255 and retain the rest as it is */
1252 src_r3 = _mm_max_epi16(src_r3, neg_255_8x16b);
1253 /* Saturate all values > 255 to 255 and retain the rest as it is */
1254 temp2 = _mm_min_epi16(src_r3, pos_255_8x16b);
1255
1256 /* Perform Forward transform */
1257 /*-------------------------------------------------------------*/
1258 /* DCT [ Horizontal transformation ] */
1259 /*-------------------------------------------------------------*/
1260 // Matrix transpose
1261 /*
1262 * a0 a1 a2 a3
1263 * b0 b1 b2 b3
1264 * c0 c1 c2 c3
1265 * d0 d1 d2 d3
1266 */
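    /* a0 b0 c0 d0 a1 b1 c1 d1 */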
1267 temp1 = _mm_unpacklo_epi32(temp0, temp2);
1268 /* a2 b2 c2 d2 a3 b3 c3 d3 */
1269 temp3 = _mm_unpackhi_epi32(temp0, temp2);
1270
1271 /* a0 b0 c0 d0 */
1272 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
1273 /* a1 b1 c1 d1 */
1274 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
1275 /* a2 b2 c2 d2 */
1276 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
1277 /* a3 b3 c3 d3 */
1278 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
1279
1280 /*----------------------------------------------------------*/
1281 /* x0 = z0 + z3 */
1282 temp0 = _mm_add_epi16(src_r0, src_r3);
1283 /* x1 = z1 + z2 */
1284 temp1 = _mm_add_epi16(src_r1, src_r2);
1285 /* x2 = z1 - z2 */
1286 temp2 = _mm_sub_epi16(src_r1, src_r2);
1287 /* x3 = z0 - z3 */
1288 temp3 = _mm_sub_epi16(src_r0, src_r3);
1289
1290 /* z0 = x0 + x1 */
1291 src_r0 = _mm_add_epi16(temp0, temp1);
1292 /* z1 = (x3 << 1) + x2 */
1293 src_r1 = _mm_slli_epi16(temp3, 1);
1294 src_r1 = _mm_add_epi16(src_r1, temp2);
1295 /* z2 = x0 - x1 */
1296 src_r2 = _mm_sub_epi16(temp0, temp1);
1297 /* z3 = x3 - (x2 << 1) */
1298 src_r3 = _mm_slli_epi16(temp2, 1);
1299 src_r3 = _mm_sub_epi16(temp3, src_r3);
1300
1301 // Matrix transpose
1302 /*
1303 * a0 b0 c0 d0
1304 * a1 b1 c1 d1
1305 * a2 b2 c2 d2
1306 * a3 b3 c3 d3
1307 */
1308 /* a0 a1 b0 b1 c0 c1 d0 d1 */
1309 temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
1310 /* a2 a3 b2 b3 c2 c3 d2 d3 */
1311 temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
1312 /* a0 a1 a2 a3 b0 b1 b2 b3 */
1313 temp1 = _mm_unpacklo_epi32(temp0, temp2);
1314 /* c0 c1 c2 c3 d0 d1 d2 d3 */
1315 temp3 = _mm_unpackhi_epi32(temp0, temp2);
1316
1317 /* a0 a1 a2 a3 */
1318 src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
1319 /* b0 b1 b2 b3 */
1320 src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
1321 /* c0 c1 c2 c3 */
1322 src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
1323 /* d0 d1 d2 d3 */
1324 src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
1325
1326 /*----------------------------------------------------------*/
1327 /* x0 = z0 + z3 */
1328 temp0 = _mm_add_epi16(src_r0, src_r3);
1329 /* x1 = z1 + z2 */
1330 temp1 = _mm_add_epi16(src_r1, src_r2);
1331 /* x2 = z1 - z2 */
1332 temp2 = _mm_sub_epi16(src_r1, src_r2);
1333 /* x3 = z0 - z3 */
1334 temp3 = _mm_sub_epi16(src_r0, src_r3);
1335
1336 /* z0 = x0 + x1 */
1337 src_r0 = _mm_add_epi16(temp0, temp1);
1338 /* z1 = (x3 << 1) + x2 */
1339 src_r1 = _mm_slli_epi16(temp3, 1);
1340 src_r1 = _mm_add_epi16(src_r1, temp2);
1341 /* z2 = x0 - x1 */
1342 src_r2 = _mm_sub_epi16(temp0, temp1);
1343 /* z3 = x3 - (x2 << 1) */
1344 src_r3 = _mm_slli_epi16(temp2, 1);
1345 src_r3 = _mm_sub_epi16(temp3, src_r3);
1346
1347 /* get the first 16 bits from the register */
1348 tmp_dc = _mm_extract_epi16(src_r0, 0);
1349 *pi2_dc_out = tmp_dc;
1350
1351 /* a0 a1 a2 a3 b0 b1 b2 b3 */
1352 src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
1353 /* c0 c1 c2 c3 d0 d1 d2 d3 */
1354 src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
1355 sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
1356 sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);
1357
1358 sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
1359 sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);
1360
1361 sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
1362 sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);
1363
1364 src_r0 = _mm_abs_epi16(src_r0);
1365 src_r2 = _mm_abs_epi16(src_r2);
1366
1367 threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
1368 threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);
1369
1370 src_r1 = _mm_srli_si128(src_r0, 8);
1371 src_r0 = _mm_cvtepu16_epi32(src_r0);
1372 src_r1 = _mm_cvtepu16_epi32(src_r1);
1373 src_r3 = _mm_srli_si128(src_r2, 8);
1374 src_r2 = _mm_cvtepu16_epi32(src_r2);
1375 src_r3 = _mm_cvtepu16_epi32(src_r3);
1376
1377 temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
1378 scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
1379 temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
1380 scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
1381 temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
1382 temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
1383
1384 temp0 = _mm_mullo_epi32(temp0, src_r0);
1385 temp1 = _mm_mullo_epi32(temp1, src_r1);
1386 temp2 = _mm_mullo_epi32(temp2, src_r2);
1387 temp3 = _mm_mullo_epi32(temp3, src_r3);
1388
1389 temp0 = _mm_add_epi32(temp0, rnd_fact);
1390 temp1 = _mm_add_epi32(temp1, rnd_fact);
1391 temp2 = _mm_add_epi32(temp2, rnd_fact);
1392 temp3 = _mm_add_epi32(temp3, rnd_fact);
1393
1394 temp0 = _mm_srli_epi32(temp0, u4_qbits);
1395 temp1 = _mm_srli_epi32(temp1, u4_qbits);
1396 temp2 = _mm_srli_epi32(temp2, u4_qbits);
1397 temp3 = _mm_srli_epi32(temp3, u4_qbits);
1398
1399 temp0 = _mm_packs_epi32(temp0, temp1);
1400 temp2 = _mm_packs_epi32(temp2, temp3);
1401
1402 temp0 = _mm_sign_epi16(temp0, sign_reg0);
1403 temp2 = _mm_sign_epi16(temp2, sign_reg2);
1404
1405 temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
1406 temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);
1407
1408 _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
1409 _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
1410
1411 cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
1412 cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
1413
1414 mask0 = _mm_movemask_epi8(cmp0);
1415 mask1 = _mm_movemask_epi8(cmp1);
1416 u4_zero_coeff = 0;
1417 if(mask0)
1418 {
1419 if(mask0 == 0xffff)
1420 u4_zero_coeff += 8;
1421 else
1422 {
1423 cmp0 = _mm_and_si128(temp_1, cmp0);
1424 sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
1425 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
1426 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
1427 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
1428 }
1429 }
1430 if(mask1)
1431 {
1432 if(mask1 == 0xffff)
1433 u4_zero_coeff += 8;
1434 else
1435 {
1436 cmp1 = _mm_and_si128(temp_1, cmp1);
1437 sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
1438 sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
1439 sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
1440 u4_zero_coeff += _mm_cvtsi128_si32(sum2);
1441 }
1442 }
1443
1444 /* Return total nonzero coefficients in the current sub block */
1445 u4_nonzero_coeff = 16 - u4_zero_coeff;
1446 *pu1_nnz = u4_nonzero_coeff;
1447 }
1448
/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward hadamard transform and quantization on a
 *  4x4 block
 *
 * @par Description:
 *  The function accepts a source buffer and an estimation buffer. From
 *  these, it computes the residue. This residue is then transformed and
 *  quantized. The transform and quantization are computed in place, using
 *  the residue buffer.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[in] pi2_out
 *  Pointer to residual sub-block
 *
 * @param[in] i4_src_stride
 *  Source stride
 *
 * @param[in] i4_pred_stride
 *  Prediction stride
 *
 * @param[in] dst_strd
 *  Destination stride
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @returns
 *  None
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
1518
void isvc_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                   resi_trans_quant_constants_t *ps_quant_constants,
                                   UWORD8 *pu1_nnz)
1522 {
1523 const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
1524 const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
1525 UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
1526 UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
1527 WORD32 u4_zero_coeff, u4_nonzero_coeff = 0;
1528 __m128i cmp0, cmp1, sum0, sum1, sum2;
1529 WORD32 mask0, mask1;
1530 __m128i src_r0_r1, src_r2_r3, sign_reg;
1531 __m128i src_r0, src_r1, src_r2, src_r3;
1532 __m128i zero_8x16b = _mm_setzero_si128();
1533 __m128i temp0, temp1, temp2, temp3;
1534 __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
1535 __m128i temp_1 = _mm_set1_epi16(1);
1536 __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
1537 __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
1538
1539 UNUSED(pu2_threshold_matrix);
1540
1541 src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); // a00 a01 a02 a03 a10 a11 a12 a13 -- the
1542 // source matrix 0th,1st row
1543 src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); // a20 a21 a22 a23 a30 a31 a32 a33 --
1544 // the source matrix 2nd,3rd row
1545 sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
1546 src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); // a0 a1 a2 a3
1547 src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); // b0 b1 b2 b3
1548 sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
1549 src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); // c0 c1 c2 c3
1550 src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); // d0 d1 d2 d3
1551
    /* Perform forward transform */
1553 /*-------------------------------------------------------------*/
1554 /* Forward DC transform [ Horizontal transformation ] */
1555 /*-------------------------------------------------------------*/
1556 // Matrix transpose
1557 /*
1558 * a0 a1 a2 a3
1559 * b0 b1 b2 b3
1560 * c0 c1 c2 c3
1561 * d0 d1 d2 d3
1562 */
1563 temp0 = _mm_unpacklo_epi32(src_r0, src_r1); // a0 b0 a1 b1
1564 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); // c0 d0 c1 d1
1565 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); // a2 b2 a3 b3
1566 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); // c2 d2 c3 d3
1567 src_r0 = _mm_unpacklo_epi64(temp0, temp2); // a0 b0 c0 d0
1568 src_r1 = _mm_unpackhi_epi64(temp0, temp2); // a1 b1 c1 d1
1569 src_r2 = _mm_unpacklo_epi64(temp1, temp3); // a2 b2 c2 d2
1570 src_r3 = _mm_unpackhi_epi64(temp1, temp3); // a3 b3 c3 d3
1571
1572 temp0 = _mm_add_epi32(src_r0, src_r3);
1573 temp1 = _mm_add_epi32(src_r1, src_r2);
1574 temp2 = _mm_sub_epi32(src_r1, src_r2);
1575 temp3 = _mm_sub_epi32(src_r0, src_r3);
1576
1577 src_r0 = _mm_add_epi32(temp0, temp1);
1578 src_r1 = _mm_add_epi32(temp2, temp3);
1579 src_r2 = _mm_sub_epi32(temp0, temp1);
1580 src_r3 = _mm_sub_epi32(temp3, temp2);
1581
1582 /*-------------------------------------------------------------*/
1583 /* Forward DC transform [ Vertical transformation ] */
1584 /*-------------------------------------------------------------*/
1585 // Matrix transpose
1586 /*
1587 * a0 b0 c0 d0
1588 * a1 b1 c1 d1
1589 * a2 b2 c2 d2
1590 * a3 b3 c3 d3
1591 */
1592 temp0 = _mm_unpacklo_epi32(src_r0, src_r1); // a0 a1 b0 b1
1593 temp2 = _mm_unpacklo_epi32(src_r2, src_r3); // a2 a3 b2 b3
1594 temp1 = _mm_unpackhi_epi32(src_r0, src_r1); // c0 c1 d0 d1
1595 temp3 = _mm_unpackhi_epi32(src_r2, src_r3); // c2 c3 d2 d3
1596 src_r0 = _mm_unpacklo_epi64(temp0, temp2); // a0 a1 a2 a3
1597 src_r1 = _mm_unpackhi_epi64(temp0, temp2); // b0 b1 b2 b3
1598 src_r2 = _mm_unpacklo_epi64(temp1, temp3); // c0 c1 c2 c3
1599 src_r3 = _mm_unpackhi_epi64(temp1, temp3); // d0 d1 d2 d3
1600
1601 temp0 = _mm_add_epi32(src_r0, src_r3);
1602 temp1 = _mm_add_epi32(src_r1, src_r2);
1603 temp2 = _mm_sub_epi32(src_r1, src_r2);
1604 temp3 = _mm_sub_epi32(src_r0, src_r3);
1605
1606 src_r0 = _mm_add_epi32(temp0, temp1);
1607 src_r1 = _mm_add_epi32(temp2, temp3);
1608 src_r2 = _mm_sub_epi32(temp0, temp1);
1609 src_r3 = _mm_sub_epi32(temp3, temp2);
1610
    src_r0 = _mm_srai_epi32(src_r0, 1);
    src_r1 = _mm_srai_epi32(src_r1, 1);
    src_r2 = _mm_srai_epi32(src_r2, 1);
    src_r3 = _mm_srai_epi32(src_r3, 1);

    // Quantization
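    /*
     * Editor's note: each coefficient x is quantized below as
     *     out = sign(x) * ((|x| * pu2_scale_matrix[0] + u4_round_factor) >> u4_qbits)
     * The sign is saved first, the magnitude is scaled, rounded and shifted,
     * and the sign is restored at the end.
     */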
    sign_reg0 =
        _mm_cmpgt_epi32(zero_8x16b, src_r0);  // Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
    sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
    sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);

    sign_reg0 = _mm_packs_epi32(sign_reg0,
                                sign_reg1);  // Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);

    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);  // Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_slli_epi16(sign_reg2, 1);

    sign_reg0 =
        _mm_add_epi16(temp_1, sign_reg0);  // Sign = -1 or 1 depending on <0 or >0 respectively
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi32(src_r0);  // Absolute values
    src_r1 = _mm_abs_epi32(src_r1);
    src_r2 = _mm_abs_epi32(src_r2);
    src_r3 = _mm_abs_epi32(src_r3);

    temp0 = _mm_mullo_epi32(scale_val, src_r0);  // multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, src_r1);
    temp2 = _mm_mullo_epi32(scale_val, src_r2);
    temp3 = _mm_mullo_epi32(scale_val, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact);  // Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0,
                           u4_qbits);  // Right shift by qbits; the values are non-negative
                                       // here, so a logical shift is safe
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);  // Final values are 16-bits only.
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);  // Sign restoration
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total non-zero coefficients in the current sub-block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    pu1_nnz[0] = u4_nonzero_coeff;
}
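
/*
 * Editor's note: the scalar routine below is an illustrative sketch and is not
 * part of the original Ittiam sources; its name and signature are hypothetical
 * and it is kept disabled so it does not affect the build. Assuming pi2_src
 * holds a 4x4 block of DC values, it mirrors the SSE4.2 routine above: a
 * separable 4x4 Hadamard transform, a halving, and quantization with
 * pu2_scale_matrix[0], u4_round_factor and u4_qbits, with the non-zero count
 * written to pu1_nnz[0]. (The SIMD path additionally saturates to 16 bits when
 * packing the results.)
 */
#if 0
static void isvc_hadamard_quant_4x4_scalar_sketch(const WORD16 *pi2_src, WORD16 *pi2_dst,
                                                  const UWORD16 *pu2_scale_matrix,
                                                  UWORD32 u4_round_factor, UWORD32 u4_qbits,
                                                  UWORD8 *pu1_nnz)
{
    WORD32 ai4_tmp[16];
    WORD32 i;
    UWORD32 u4_nonzero_coeff = 0;

    /* Horizontal 1-D Hadamard butterfly on each row */
    for(i = 0; i < 4; i++)
    {
        WORD32 s0 = pi2_src[4 * i + 0] + pi2_src[4 * i + 3];
        WORD32 s1 = pi2_src[4 * i + 1] + pi2_src[4 * i + 2];
        WORD32 d0 = pi2_src[4 * i + 0] - pi2_src[4 * i + 3];
        WORD32 d1 = pi2_src[4 * i + 1] - pi2_src[4 * i + 2];

        ai4_tmp[4 * i + 0] = s0 + s1;
        ai4_tmp[4 * i + 1] = d0 + d1;
        ai4_tmp[4 * i + 2] = s0 - s1;
        ai4_tmp[4 * i + 3] = d0 - d1;
    }

    /* Vertical 1-D Hadamard butterfly on each column, then halve and quantize */
    for(i = 0; i < 4; i++)
    {
        WORD32 j;
        WORD32 s0 = ai4_tmp[0 + i] + ai4_tmp[12 + i];
        WORD32 s1 = ai4_tmp[4 + i] + ai4_tmp[8 + i];
        WORD32 d0 = ai4_tmp[0 + i] - ai4_tmp[12 + i];
        WORD32 d1 = ai4_tmp[4 + i] - ai4_tmp[8 + i];
        WORD32 ai4_col[4];

        ai4_col[0] = s0 + s1;
        ai4_col[1] = d0 + d1;
        ai4_col[2] = s0 - s1;
        ai4_col[3] = d0 - d1;

        for(j = 0; j < 4; j++)
        {
            /* Arithmetic shift, as done by _mm_srai_epi32 in the SIMD path */
            WORD32 i4_value = ai4_col[j] >> 1;
            WORD32 i4_sign = (i4_value < 0) ? -1 : 1;
            WORD32 i4_abs = (i4_value < 0) ? -i4_value : i4_value;
            WORD32 i4_level =
                (WORD32) (((UWORD32) i4_abs * pu2_scale_matrix[0] + u4_round_factor) >> u4_qbits);

            pi2_dst[4 * j + i] = (WORD16) (i4_sign * i4_level);
            u4_nonzero_coeff += (i4_level != 0);
        }
    }

    pu1_nnz[0] = (UWORD8) u4_nonzero_coeff;
}
#endif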

/**
 *******************************************************************************
 *
 * @brief
 *  This function performs a forward Hadamard transform and quantization on the
 *  2x2 chroma DC blocks of both the U and V planes
 *
 * @par Description:
 *  The function accepts a buffer holding the chroma DC residue values of the
 *  two planes, transforms it and then quantizes it. The transform and
 *  quantization are computed in place, using the residue buffer itself.
 *
 * @param[in] pi2_src
 *  Pointer to the source (residue) buffer
 *
 * @param[out] pi2_dst
 *  Pointer to the quantized output buffer
 *
 * @param[in] ps_quant_constants
 *  Pointer to the quantization constants: forward quant scale matrix, forward
 *  quant threshold matrix, round factor and u4_qbits, where u4_qbits is
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients per plane
 *
 * @returns
 *  None
 *
 * @remarks
 *  NNZ for the U and V DC blocks is populated at positions 0 and 1 of pu1_nnz
 *  respectively
 *
 *******************************************************************************
 */

void isvc_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                      resi_trans_quant_constants_t *ps_quant_constants,
                                      UWORD8 *pu1_nnz)
{
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
    WORD32 val, zero_coeff_0 = 0, zero_coeff_1 = 0; /* per-plane zero-coefficient counts */
    __m128i cmp, cmp0, cmp1;
    __m128i sum0, sum1;
    WORD32 mask, mask0, mask1;
    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    __m128i sign_reg0, sign_reg1;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);

    UNUSED(pu2_threshold_matrix);

    src = _mm_loadu_si128((__m128i *) pi2_src);  // a0 a1 a2 a3 b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
    plane_0 = _mm_unpacklo_epi16(src, sign_reg);  // a0 a1 a2 a3 -- 32 bits
    plane_1 = _mm_unpackhi_epi16(src, sign_reg);  // b0 b1 b2 b3 -- 32 bits

    temp0 = _mm_hadd_epi32(plane_0, plane_1);  // a0+a1 a2+a3 b0+b1 b2+b3
    temp1 = _mm_hsub_epi32(plane_0, plane_1);  // a0-a1 a2-a3 b0-b1 b2-b3

    plane_0 = _mm_hadd_epi32(temp0, temp1);  // a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
    plane_1 = _mm_hsub_epi32(temp0, temp1);  // a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3

    temp0 =
        _mm_unpacklo_epi32(plane_0, plane_1);  // a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
    temp1 =
        _mm_unpackhi_epi32(plane_0, plane_1);  // a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_unpacklo_epi64(temp0, temp1);  // a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
    plane_1 = _mm_unpackhi_epi64(temp0, temp1);  // b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8);  // a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8);  // b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
    // Quantization
    sign_reg0 =
        _mm_cmpgt_epi32(zero_8x16b, plane_0);  // Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);

    sign_reg0 = _mm_packs_epi32(sign_reg0,
                                sign_reg1);  // Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);  // Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg0 =
        _mm_add_epi16(temp_1, sign_reg0);  // Sign = -1 or 1 depending on <0 or >0 respectively

    plane_0 = _mm_abs_epi32(plane_0);  // Absolute values
    plane_1 = _mm_abs_epi32(plane_1);

    temp0 = _mm_mullo_epi32(scale_val, plane_0);  // multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, plane_1);  // multiply by pu2_scale_matrix[0]

    temp0 = _mm_add_epi32(temp0, rnd_fact);  // Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);

    temp0 = _mm_srli_epi32(temp0,
                           u4_qbits);  // Right shift by qbits; the values are non-negative
                                       // here, so a logical shift is safe
    temp1 = _mm_srli_epi32(temp1, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);  // Final values are 16-bits only.
    temp0 = _mm_sign_epi16(temp0, sign_reg0);  // Sign restoration

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);

    cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
    mask = _mm_movemask_epi8(cmp);
    mask0 = mask & 0xff;
    mask1 = mask >> 8;
    if(mask0)
    {
        if(mask0 == 0xff)
            zero_coeff_0 += 4;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            val = _mm_cvtsi128_si32(sum1);
            val = val & 0xffff;
            zero_coeff_0 += val;
        }
    }
    if(mask1)
    {
        if(mask1 == 0xff)
            zero_coeff_1 += 4;
        else
        {
            cmp1 = _mm_srli_si128(cmp, 8);
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            zero_coeff_1 += _mm_cvtsi128_si32(sum1);
        }
    }

    /* Return total non-zero coefficients for the U and V planes */
    pu1_nnz[0] = 4 - zero_coeff_0;
    pu1_nnz[1] = 4 - zero_coeff_1;
}

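/*
 * Editor's note: illustrative scalar sketch, not part of the original Ittiam
 * sources; its name and signature are hypothetical and it is kept disabled so
 * it does not affect the build. Assuming pi2_src holds the four U-plane chroma
 * DC residues followed by the four V-plane residues (each 2x2 block in
 * row-major order), it mirrors the SSE4.2 routine above: a 2x2 Hadamard per
 * plane followed by quantization, with the per-plane non-zero counts written
 * to pu1_nnz[0] and pu1_nnz[1].
 */
#if 0
static void isvc_hadamard_quant_2x2_uv_scalar_sketch(const WORD16 *pi2_src, WORD16 *pi2_dst,
                                                     const UWORD16 *pu2_scale_matrix,
                                                     UWORD32 u4_round_factor, UWORD32 u4_qbits,
                                                     UWORD8 *pu1_nnz)
{
    WORD32 i4_plane, i;

    for(i4_plane = 0; i4_plane < 2; i4_plane++)
    {
        const WORD16 *pi2_in = pi2_src + 4 * i4_plane;
        WORD16 *pi2_out = pi2_dst + 4 * i4_plane;
        WORD32 ai4_hadamard[4];
        UWORD32 u4_nonzero_coeff = 0;

        /* 2x2 Hadamard of the block (c0 c1; c2 c3), output in the same order
         * as the SIMD path stores it */
        ai4_hadamard[0] = pi2_in[0] + pi2_in[1] + pi2_in[2] + pi2_in[3];
        ai4_hadamard[1] = pi2_in[0] - pi2_in[1] + pi2_in[2] - pi2_in[3];
        ai4_hadamard[2] = pi2_in[0] + pi2_in[1] - pi2_in[2] - pi2_in[3];
        ai4_hadamard[3] = pi2_in[0] - pi2_in[1] - pi2_in[2] + pi2_in[3];

        /* Quantize: level = (|x| * scale + round) >> qbits, sign restored */
        for(i = 0; i < 4; i++)
        {
            WORD32 i4_sign = (ai4_hadamard[i] < 0) ? -1 : 1;
            WORD32 i4_abs = (ai4_hadamard[i] < 0) ? -ai4_hadamard[i] : ai4_hadamard[i];
            WORD32 i4_level =
                (WORD32) (((UWORD32) i4_abs * pu2_scale_matrix[0] + u4_round_factor) >> u4_qbits);

            pi2_out[i] = (WORD16) (i4_sign * i4_level);
            u4_nonzero_coeff += (i4_level != 0);
        }

        pu1_nnz[i4_plane] = (UWORD8) u4_nonzero_coeff;
    }
}
#endif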