1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
/**
 *******************************************************************************
 * @file
 *  isvc_resi_trans_quant_sse42.c
 *
 * @brief
 *  Contains SSE4.2 function definitions for the single-stage forward
 *  transform for H.264. The functions compute the residue, apply the core
 *  transform and then quantize the result.
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - isvc_resi_trans_quant_4x4_sse42()
 *  - isvc_resi_trans_quant_4x4_with_res_pred_sse42()
 *  - isvc_resi_trans_quant_chroma_4x4_sse42()
 *  - isvc_resi_trans_quant_chroma_4x4_with_res_pred_sse42()
 *
 * @remarks
 *  A scalar sketch of the per-coefficient quantization step follows the
 *  include list below.
 *
 *******************************************************************************
 */
50 #include <immintrin.h>
51 
52 #include "ih264_typedefs.h"
53 #include "ih264_debug.h"
54 #include "ih264_defs.h"
55 #include "ih264_trans_macros.h"
56 #include "ih264_macros.h"
57 #include "ih264_platform_macros.h"
58 #include "ih264_trans_data.h"
59 #include "ih264_size_defs.h"
60 #include "isvc_structs.h"
61 #include "isvc_trans_quant_itrans_iquant.h"
62 
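/*
 * All the kernels below quantize a transformed coefficient the same way: the
 * magnitude is multiplied by the forward-quant scale matrix entry, the
 * rounding factor is added, the sum is right-shifted by u4_qbits, the sign is
 * restored, and coefficients whose (pre-scaling) magnitude lies below the
 * threshold matrix entry are forced to zero.  The helper below is a scalar
 * restatement of that step for reference only; its name is ours, the encoder
 * never calls it directly (it is reused by the reference sketches further
 * down in this file), and unlike _mm_packs_epi32 it does not saturate the
 * result to 16 bits.  Drop or guard these *_sketch helpers if unused-function
 * warnings are fatal in your build.
 */
static WORD16 isvc_quantize_coeff_scalar_sketch(WORD16 i2_coef, UWORD16 u2_scale,
                                                UWORD16 u2_threshold, UWORD32 u4_round_factor,
                                                UWORD32 u4_qbits)
{
    WORD32 i4_abs = (i2_coef < 0) ? -i2_coef : i2_coef;
    WORD32 i4_level;

    /* Coefficients below the threshold are zeroed out */
    if(i4_abs < u2_threshold)
    {
        return 0;
    }

    /* level = (|coef| * scale + round) >> qbits, with the original sign */
    i4_level = (WORD32) (((UWORD32) i4_abs * u2_scale + u4_round_factor) >> u4_qbits);

    return (WORD16) ((i2_coef < 0) ? -i4_level : i4_level);
}
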
/**
*******************************************************************************
*
* @brief
*   This function performs forward transform and quantization on a 4x4 block
*
* @par Description:
*   The function accepts a source buffer and a prediction buffer. From these
*   it computes the residue, which is then transformed and quantized. The
*   transform and quantization are computed in place, using the residue
*   buffer as working storage.
*
* @param[in] ps_src
*   Pointer to the source buffer container
*
* @param[in] ps_pred
*   Pointer to the prediction buffer container
*
* @param[out] ps_out
*   Pointer to the output buffer container that receives the quantized
*   residual sub-block
*
* @param[in] ps_upsampled_res
*   Pointer to the upsampled residual buffer container (unused in this
*   variant)
*
* @param[in] ps_quant_constants
*   Pointer to the quantization constants: forward quant scale matrix,
*   forward quant threshold matrix, rounding factor and
*   u4_qbits = QP_BITS_h264_4x4 + floor(QP/6)
*
* @param[out] pu1_nnz
*   Total number of non-zero coefficients in the current sub-block
*
* @param[out] pi2_dc_out
*   DC coefficient of the transformed block, before quantization
*
* @param[in] u1_use_upsampled_res
*   Must be 0 for this variant; residual prediction is handled by
*   isvc_resi_trans_quant_4x4_with_res_pred_sse42()
*
* @returns
*   None
*
* @remarks
*   A plain-C reference sketch of the same computation follows the
*   definition of this function.
*
*******************************************************************************
*/
131 void isvc_resi_trans_quant_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
132                                      buffer_container_t *ps_out,
133                                      buffer_container_t *ps_upsampled_res,
134                                      resi_trans_quant_constants_t *ps_quant_constants,
135                                      UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
136                                      UWORD8 u1_use_upsampled_res)
137 {
138     const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
139     const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
140     UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
141     UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
142     WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
143     WORD32 mask0, mask1;
144     __m128i sum0, sum1, sum2, cmp0, cmp1;
145     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
146     __m128i temp_2 = _mm_set1_epi16(2);
147     __m128i temp_1 = _mm_set1_epi16(1);
148     __m128i src_r0, src_r1, src_r2, src_r3;
149     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
150     __m128i temp0, temp1, temp2, temp3;
151     /* all bits reset to zero */
152     __m128i zero_8x16b = _mm_setzero_si128();
153     __m128i sign_reg0, sign_reg2;
154     __m128i scalemat_r0_r1, scalemat_r2_r3;
155     __m128i threshold_r0_r1, threshold_r2_r3;
156     __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
157 
158     UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
159     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
160     WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
161     WORD32 i4_src_stride = ps_src->i4_data_stride;
162     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
163     WORD32 i4_out_stride = ps_out->i4_data_stride;
164 
165     ASSERT(0 == u1_use_upsampled_res);
166     ASSERT(4 == i4_out_stride);
167     UNUSED(u1_use_upsampled_res);
168     UNUSED(i4_out_stride);
169     UNUSED(ps_upsampled_res);
170 
171     /* b00 b01 b02 b03 b10 b11 b12 b13
172      -- the scaling matrix 0th,1st row */
173     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
174 
175     /* b20 b21 b22 b23 b30 b31 b32 b33
176      -- the scaling matrix 2nd,3rd row */
177     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
178 
179     /* b00 b01 b02 b03 b10 b11 b12 b13
180      -- the threshold matrix 0th,1st row */
181     threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
182 
183     /* b20 b21 b22 b23 b30 b31 b32 b33
184      -- the threshold matrix 2nd,3rd row */
185     threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
186 
187     /* a00 a01 a02 a03 0 0 0 0 0
188      0 0 0 -- all 8 bits */
189     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
190 
191     /* a10 a11 a12 a13 0 0 0 0 0 0 0
192      0 -- all 8 bits */
193     src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
194 
195     /* a20 a21 a22 a23 0 0 0 0 0 0 0
196      0 -- all 8 bits */
197     src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
198 
199     /* a30 a31 a32 a33 0 0 0 0 0 0 0
200      0 -- all 8 bits */
201     src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
202 
203     src_r0 = _mm_cvtepu8_epi16(src_r0);
204     src_r1 = _mm_cvtepu8_epi16(src_r1);
205     src_r2 = _mm_cvtepu8_epi16(src_r2);
206     src_r3 = _mm_cvtepu8_epi16(src_r3);
207 
208     /* p00 p01 p02 p03 0 0 0 0 0
209      0 0 0 -- all 8 bits */
210     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
211 
212     /* p10 p11 p12 p13 0 0 0 0 0
213      0 0 0 -- all 8 bits */
214     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
215 
216     /* p20 p21 p22 p23 0 0 0 0 0
217      0 0 0 -- all 8 bits */
218     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
219 
220     /* p30 p31 p32 p33 0 0 0 0 0
221      0 0 0 -- all 8 bits */
222     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
223 
224     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
225     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
226     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
227     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
228 
229     src_r0 = _mm_sub_epi16(src_r0, pred_r0);
230     src_r1 = _mm_sub_epi16(src_r1, pred_r1);
231     src_r2 = _mm_sub_epi16(src_r2, pred_r2);
232     src_r3 = _mm_sub_epi16(src_r3, pred_r3);
233 
234     /* Perform Forward transform */
235     /*-------------------------------------------------------------*/
236     /* DCT [ Horizontal transformation ]                          */
237     /*-------------------------------------------------------------*/
238     // Matrix transpose
239     /*
240      *  a0 a1 a2 a3
241      *  b0 b1 b2 b3
242      *  c0 c1 c2 c3
243      *  d0 d1 d2 d3
244      */
245     /* a0 b0 a1 b1 a2 b2 a3 b3 */
246     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
247     /* c0 d0 c1 d1 c2 d2 c3 d3 */
248     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
249     /* a0 b0 c0 d0 a1 b1 c1 d1 */
250     temp1 = _mm_unpacklo_epi32(temp0, temp2);
251     /* a2 b2 c2 d2 a3 b3 c3 d3 */
252     temp3 = _mm_unpackhi_epi32(temp0, temp2);
253 
254     /* a0 b0 c0 d0 */
255     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
256     /* a1 b1 c1 d1 */
257     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
258     /* a2 b2 c2 d2 */
259     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
260     /* a3 b3 c3 d3 */
261     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
262 
263     /*----------------------------------------------------------*/
264     /* x0 = z0 + z3                                             */
265     temp0 = _mm_add_epi16(src_r0, src_r3);
266     /* x1 = z1 + z2                                             */
267     temp1 = _mm_add_epi16(src_r1, src_r2);
268     /* x2 = z1 - z2                                             */
269     temp2 = _mm_sub_epi16(src_r1, src_r2);
270     /* x3 = z0 - z3                                             */
271     temp3 = _mm_sub_epi16(src_r0, src_r3);
272 
273     /* z0 = x0 + x1                                             */
274     src_r0 = _mm_add_epi16(temp0, temp1);
275     /* z1 = (x3 << 1) + x2                                      */
276     src_r1 = _mm_slli_epi16(temp3, 1);
277     src_r1 = _mm_add_epi16(src_r1, temp2);
278     /* z2 = x0 - x1                                             */
279     src_r2 = _mm_sub_epi16(temp0, temp1);
280     /* z3 = x3 - (x2 << 1)                                      */
281     src_r3 = _mm_slli_epi16(temp2, 1);
282     src_r3 = _mm_sub_epi16(temp3, src_r3);
283 
284     // Matrix transpose
285     /*
286      *  a0 b0 c0 d0
287      *  a1 b1 c1 d1
288      *  a2 b2 c2 d2
289      *  a3 b3 c3 d3
290      */
291     /* a0 a1 b0 b1 c0 c1 d0 d1 */
292     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
293     /* a2 a3 b2 b3 c2 c3 d2 d3 */
294     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
295     /* a0 a1 a2 a3 b0 b1 b2 b3 */
296     temp1 = _mm_unpacklo_epi32(temp0, temp2);
297     /* c0 c1 c2 c3 d0 d1 d2 d3 */
298     temp3 = _mm_unpackhi_epi32(temp0, temp2);
299 
300     /* a0 a1 a2 a3 */
301     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
302     /* b0 b1 b2 b3 */
303     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
304     /* c0 c1 c2 c3 */
305     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
306     /* d0 d1 d2 d3 */
307     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
308 
309     /*----------------------------------------------------------*/
310     /* x0 = z0 + z3                                             */
311     temp0 = _mm_add_epi16(src_r0, src_r3);
312     /* x1 = z1 + z2                                             */
313     temp1 = _mm_add_epi16(src_r1, src_r2);
314     /* x2 = z1 - z2                                             */
315     temp2 = _mm_sub_epi16(src_r1, src_r2);
316     /* x3 = z0 - z3                                             */
317     temp3 = _mm_sub_epi16(src_r0, src_r3);
318 
319     /* z0 = x0 + x1                                             */
320     src_r0 = _mm_add_epi16(temp0, temp1);
321     /* z1 = (x3 << 1) + x2                                      */
322     src_r1 = _mm_slli_epi16(temp3, 1);
323     src_r1 = _mm_add_epi16(src_r1, temp2);
324     /* z2 = x0 - x1                                             */
325     src_r2 = _mm_sub_epi16(temp0, temp1);
326     /* z3 = x3 - (x2 << 1)                                      */
327     src_r3 = _mm_slli_epi16(temp2, 1);
328     src_r3 = _mm_sub_epi16(temp3, src_r3);
329 
330     /* get the first 16 bits from the register */
331     tmp_dc = _mm_extract_epi16(src_r0, 0);
332     *pi2_dc_out = tmp_dc;
333 
334     /* a0 a1 a2 a3 b0 b1 b2 b3 */
335     src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
336     /* c0 c1 c2 c3 d0 d1 d2 d3 */
337     src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
338     sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
339     sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);
340 
341     sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
342     sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);
343 
344     sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
345     sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);
346 
347     src_r0 = _mm_abs_epi16(src_r0);
348     src_r2 = _mm_abs_epi16(src_r2);
349 
350     threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
351     threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);
352 
353     src_r1 = _mm_srli_si128(src_r0, 8);
354     src_r0 = _mm_cvtepu16_epi32(src_r0);
355     src_r1 = _mm_cvtepu16_epi32(src_r1);
356     src_r3 = _mm_srli_si128(src_r2, 8);
357     src_r2 = _mm_cvtepu16_epi32(src_r2);
358     src_r3 = _mm_cvtepu16_epi32(src_r3);
359 
360     temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
361     scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
362     temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
363     scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
364     temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
365     temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
366 
367     temp0 = _mm_mullo_epi32(temp0, src_r0);
368     temp1 = _mm_mullo_epi32(temp1, src_r1);
369     temp2 = _mm_mullo_epi32(temp2, src_r2);
370     temp3 = _mm_mullo_epi32(temp3, src_r3);
371 
372     temp0 = _mm_add_epi32(temp0, rnd_fact);
373     temp1 = _mm_add_epi32(temp1, rnd_fact);
374     temp2 = _mm_add_epi32(temp2, rnd_fact);
375     temp3 = _mm_add_epi32(temp3, rnd_fact);
376 
377     temp0 = _mm_srli_epi32(temp0, u4_qbits);
378     temp1 = _mm_srli_epi32(temp1, u4_qbits);
379     temp2 = _mm_srli_epi32(temp2, u4_qbits);
380     temp3 = _mm_srli_epi32(temp3, u4_qbits);
381 
382     temp0 = _mm_packs_epi32(temp0, temp1);
383     temp2 = _mm_packs_epi32(temp2, temp3);
384 
385     temp0 = _mm_sign_epi16(temp0, sign_reg0);
386     temp2 = _mm_sign_epi16(temp2, sign_reg2);
387 
388     temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
389     temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);
390 
391     _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
392     _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
393 
394     cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
395     cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
396 
397     mask0 = _mm_movemask_epi8(cmp0);
398     mask1 = _mm_movemask_epi8(cmp1);
399     u4_zero_coeff = 0;
400 
401     if(mask0)
402     {
403         if(mask0 == 0xffff)
404             u4_zero_coeff += 8;
405         else
406         {
407             cmp0 = _mm_and_si128(temp_1, cmp0);
408             sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
409             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
410             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
411             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
412         }
413     }
414     if(mask1)
415     {
416         if(mask1 == 0xffff)
417             u4_zero_coeff += 8;
418         else
419         {
420             cmp1 = _mm_and_si128(temp_1, cmp1);
421             sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
422             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
423             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
424             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
425         }
426     }
427 
428     /* Return total nonzero coefficients in the current sub block */
429     u4_nonzero_coeff = 16 - u4_zero_coeff;
430     *pu1_nnz = u4_nonzero_coeff;
431 }
432 
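/*
 * A plain-C reference sketch of isvc_resi_trans_quant_4x4_sse42() above.  It
 * is illustrative only: the function name and the raw-pointer interface are
 * ours (the real kernel takes buffer_container_t arguments) and the encoder
 * never calls it.  It spells out the residue computation, the two-pass 4x4
 * core transform and the quantization that the SIMD code performs on packed
 * registers.  pi2_out receives a contiguous 4x4 block, matching the output
 * stride of 4 asserted by the kernels above.
 */
static void isvc_resi_trans_quant_4x4_scalar_sketch(
    const UWORD8 *pu1_src, WORD32 i4_src_stride, const UWORD8 *pu1_pred, WORD32 i4_pred_stride,
    WORD16 *pi2_out, const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix,
    UWORD32 u4_round_factor, UWORD32 u4_qbits, UWORD8 *pu1_nnz, WORD16 *pi2_dc_out)
{
    WORD32 ai4_resi[4][4], ai4_tmp[4][4];
    WORD32 i, j;
    WORD32 i4_nnz = 0;

    /* Residue: source minus prediction */
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            ai4_resi[i][j] = (WORD32) pu1_src[i * i4_src_stride + j] -
                             (WORD32) pu1_pred[i * i4_pred_stride + j];
        }
    }

    /* Horizontal pass of the 4x4 core transform (rows) */
    for(i = 0; i < 4; i++)
    {
        WORD32 x0 = ai4_resi[i][0] + ai4_resi[i][3];
        WORD32 x1 = ai4_resi[i][1] + ai4_resi[i][2];
        WORD32 x2 = ai4_resi[i][1] - ai4_resi[i][2];
        WORD32 x3 = ai4_resi[i][0] - ai4_resi[i][3];

        ai4_tmp[i][0] = x0 + x1;
        ai4_tmp[i][1] = (x3 << 1) + x2;
        ai4_tmp[i][2] = x0 - x1;
        ai4_tmp[i][3] = x3 - (x2 << 1);
    }

    /* Vertical pass of the 4x4 core transform (columns) */
    for(j = 0; j < 4; j++)
    {
        WORD32 x0 = ai4_tmp[0][j] + ai4_tmp[3][j];
        WORD32 x1 = ai4_tmp[1][j] + ai4_tmp[2][j];
        WORD32 x2 = ai4_tmp[1][j] - ai4_tmp[2][j];
        WORD32 x3 = ai4_tmp[0][j] - ai4_tmp[3][j];

        ai4_resi[0][j] = x0 + x1;
        ai4_resi[1][j] = (x3 << 1) + x2;
        ai4_resi[2][j] = x0 - x1;
        ai4_resi[3][j] = x3 - (x2 << 1);
    }

    /* DC coefficient is reported before quantization, as in the SIMD kernel */
    *pi2_dc_out = (WORD16) ai4_resi[0][0];

    /* Quantize each coefficient and count the non-zero results */
    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            pi2_out[i * 4 + j] = isvc_quantize_coeff_scalar_sketch(
                (WORD16) ai4_resi[i][j], pu2_scale_matrix[i * 4 + j],
                pu2_threshold_matrix[i * 4 + j], u4_round_factor, u4_qbits);
            i4_nnz += (0 != pi2_out[i * 4 + j]);
        }
    }

    *pu1_nnz = (UWORD8) i4_nnz;
}
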
433 void isvc_resi_trans_quant_4x4_with_res_pred_sse42(
434     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
435     buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
436     UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
437 {
438     const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
439     const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
440     UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
441     UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
442     WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
443     WORD32 mask0, mask1;
444     __m128i sum0, sum1, sum2, cmp0, cmp1;
445     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
446     __m128i temp_2 = _mm_set1_epi16(2);
447     __m128i temp_1 = _mm_set1_epi16(1);
448     __m128i src_r0, src_r1, src_r2, src_r3;
449     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
450     __m128i temp0, temp1, temp2, temp3;
451     /* all bits reset to zero */
452     __m128i zero_8x16b = _mm_setzero_si128();
453     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
454     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
455     __m128i sign_reg0, sign_reg2;
456     __m128i scalemat_r0_r1, scalemat_r2_r3;
457     __m128i upsampled_res0, upsampled_res1, upsampled_res2, upsampled_res3;
458     __m128i threshold_r0_r1, threshold_r2_r3;
459     __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
460 
461     UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
462     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
463     WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
464     WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
465     WORD32 i4_src_stride = ps_src->i4_data_stride;
466     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
467     WORD32 i4_out_stride = ps_out->i4_data_stride;
468     WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
469 
470     ASSERT(1 == u1_use_upsampled_res);
471     ASSERT(4 == i4_out_stride);
472     UNUSED(u1_use_upsampled_res);
473     UNUSED(i4_out_stride);
474     UNUSED(ps_upsampled_res);
475 
476     /* b00 b01 b02 b03 b10 b11 b12 b13
477      -- the scaling matrix 0th,1st row */
478     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
479 
480     /* b20 b21 b22 b23 b30 b31 b32 b33
481      -- the scaling matrix 2nd,3rd row */
482     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
483 
484     /* b00 b01 b02 b03 b10 b11 b12 b13
485      -- the threshold matrix 0th,1st row */
486     threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
487 
488     /* b20 b21 b22 b23 b30 b31 b32 b33
489      -- the threshold matrix 2nd,3rd row */
490     threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
491 
492     /* a00 a01 a02 a03 0 0 0 0 0
493      0 0 0 -- all 8 bits */
494     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
495 
496     /* a10 a11 a12 a13 0 0 0 0 0 0 0
497      0 -- all 8 bits */
498     src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
499 
500     /* a20 a21 a22 a23 0 0 0 0 0 0 0
501      0 -- all 8 bits */
502     src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
503 
504     /* a30 a31 a32 a33 0 0 0 0 0 0 0
505      0 -- all 8 bits */
506     src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
507 
508     src_r0 = _mm_cvtepu8_epi16(src_r0);
509     src_r1 = _mm_cvtepu8_epi16(src_r1);
510     src_r2 = _mm_cvtepu8_epi16(src_r2);
511     src_r3 = _mm_cvtepu8_epi16(src_r3);
512 
513     /* p00 p01 p02 p03 0 0 0 0 0
514      0 0 0 -- all 8 bits */
515     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
516 
517     /* p10 p11 p12 p13 0 0 0 0 0
518      0 0 0 -- all 8 bits */
519     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
520 
521     /* p20 p21 p22 p23 0 0 0 0 0
522      0 0 0 -- all 8 bits */
523     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
524 
525     /* p30 p31 p32 p33 0 0 0 0 0
526      0 0 0 -- all 8 bits */
527     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
528 
529     pred_r0 = _mm_cvtepu8_epi16(pred_r0);
530     pred_r1 = _mm_cvtepu8_epi16(pred_r1);
531     pred_r2 = _mm_cvtepu8_epi16(pred_r2);
532     pred_r3 = _mm_cvtepu8_epi16(pred_r3);
533 
534     src_r0 = _mm_sub_epi16(src_r0, pred_r0);
535     src_r1 = _mm_sub_epi16(src_r1, pred_r1);
536     src_r2 = _mm_sub_epi16(src_r2, pred_r2);
537     src_r3 = _mm_sub_epi16(src_r3, pred_r3);
538 
539     /* load upsampled residual values and subtract from
540     the previous residue */
541     upsampled_res0 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[0]));
542 
543     upsampled_res1 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[i4_upsampled_res_stride]));
544 
545     upsampled_res2 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[2 * i4_upsampled_res_stride]));
546 
547     upsampled_res3 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[3 * i4_upsampled_res_stride]));
548 
549     src_r0 = _mm_sub_epi16(src_r0, upsampled_res0);
550     src_r1 = _mm_sub_epi16(src_r1, upsampled_res1);
551     src_r2 = _mm_sub_epi16(src_r2, upsampled_res2);
552     src_r3 = _mm_sub_epi16(src_r3, upsampled_res3);
553 
554     src_r1 = _mm_unpacklo_epi16(src_r0, src_r1);
555     src_r3 = _mm_unpacklo_epi16(src_r2, src_r3);
556 
557     /* Saturate all values < -255 to -255 and retain the rest as it is */
558     src_r1 = _mm_max_epi16(src_r1, neg_255_8x16b);
559     /* Saturate all values > 255 to 255 and retain the rest as it is */
560     temp0 = _mm_min_epi16(src_r1, pos_255_8x16b);
561 
562     /* Saturate all values < -255 to -255 and retain the rest as it is */
563     src_r3 = _mm_max_epi16(src_r3, neg_255_8x16b);
564     /* Saturate all values > 255 to 255 and retain the rest as it is */
565     temp2 = _mm_min_epi16(src_r3, pos_255_8x16b);
566 
567     /* Perform Forward transform */
568     /*-------------------------------------------------------------*/
569     /* DCT [ Horizontal transformation ]                          */
570     /*-------------------------------------------------------------*/
571     // Matrix transpose
572     /*
573      *  a0 a1 a2 a3
574      *  b0 b1 b2 b3
575      *  c0 c1 c2 c3
576      *  d0 d1 d2 d3
577      */
578     /* a0 b0 c0 d0 a1 b1 c1 d1 */
579     temp1 = _mm_unpacklo_epi32(temp0, temp2);
580     /* a2 b2 c2 d2 a3 b3 c3 d3 */
581     temp3 = _mm_unpackhi_epi32(temp0, temp2);
582 
583     /* a0 b0 c0 d0 */
584     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
585     /* a1 b1 c1 d1 */
586     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
587     /* a2 b2 c2 d2 */
588     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
589     /* a3 b3 c3 d3 */
590     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
591 
592     /*----------------------------------------------------------*/
593     /* x0 = z0 + z3                                             */
594     temp0 = _mm_add_epi16(src_r0, src_r3);
595     /* x1 = z1 + z2                                             */
596     temp1 = _mm_add_epi16(src_r1, src_r2);
597     /* x2 = z1 - z2                                             */
598     temp2 = _mm_sub_epi16(src_r1, src_r2);
599     /* x3 = z0 - z3                                             */
600     temp3 = _mm_sub_epi16(src_r0, src_r3);
601 
602     /* z0 = x0 + x1                                             */
603     src_r0 = _mm_add_epi16(temp0, temp1);
604     /* z1 = (x3 << 1) + x2                                      */
605     src_r1 = _mm_slli_epi16(temp3, 1);
606     src_r1 = _mm_add_epi16(src_r1, temp2);
607     /* z2 = x0 - x1                                             */
608     src_r2 = _mm_sub_epi16(temp0, temp1);
609     /* z3 = x3 - (x2 << 1)                                      */
610     src_r3 = _mm_slli_epi16(temp2, 1);
611     src_r3 = _mm_sub_epi16(temp3, src_r3);
612 
613     // Matrix transpose
614     /*
615      *  a0 b0 c0 d0
616      *  a1 b1 c1 d1
617      *  a2 b2 c2 d2
618      *  a3 b3 c3 d3
619      */
620     /* a0 a1 b0 b1 c0 c1 d0 d1 */
621     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
622     /* a2 a3 b2 b3 c2 c3 d2 d3 */
623     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
624     /* a0 a1 a2 a3 b0 b1 b2 b3 */
625     temp1 = _mm_unpacklo_epi32(temp0, temp2);
626     /* c0 c1 c2 c3 d0 d1 d2 d3 */
627     temp3 = _mm_unpackhi_epi32(temp0, temp2);
628 
629     /* a0 a1 a2 a3 */
630     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
631     /* b0 b1 b2 b3 */
632     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
633     /* c0 c1 c2 c3 */
634     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
635     /* d0 d1 d2 d3 */
636     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
637 
638     /*----------------------------------------------------------*/
639     /* x0 = z0 + z3                                             */
640     temp0 = _mm_add_epi16(src_r0, src_r3);
641     /* x1 = z1 + z2                                             */
642     temp1 = _mm_add_epi16(src_r1, src_r2);
643     /* x2 = z1 - z2                                             */
644     temp2 = _mm_sub_epi16(src_r1, src_r2);
645     /* x3 = z0 - z3                                             */
646     temp3 = _mm_sub_epi16(src_r0, src_r3);
647 
648     /* z0 = x0 + x1                                             */
649     src_r0 = _mm_add_epi16(temp0, temp1);
650     /* z1 = (x3 << 1) + x2                                      */
651     src_r1 = _mm_slli_epi16(temp3, 1);
652     src_r1 = _mm_add_epi16(src_r1, temp2);
653     /* z2 = x0 - x1                                             */
654     src_r2 = _mm_sub_epi16(temp0, temp1);
655     /* z3 = x3 - (x2 << 1)                                      */
656     src_r3 = _mm_slli_epi16(temp2, 1);
657     src_r3 = _mm_sub_epi16(temp3, src_r3);
658 
659     /* get the first 16 bits from the register */
660     tmp_dc = _mm_extract_epi16(src_r0, 0);
661     *pi2_dc_out = tmp_dc;
662 
663     /* a0 a1 a2 a3 b0 b1 b2 b3 */
664     src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
665     /* c0 c1 c2 c3 d0 d1 d2 d3 */
666     src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
667     sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
668     sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);
669 
670     sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
671     sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);
672 
673     sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
674     sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);
675 
676     src_r0 = _mm_abs_epi16(src_r0);
677     src_r2 = _mm_abs_epi16(src_r2);
678 
679     threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
680     threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);
681 
682     src_r1 = _mm_srli_si128(src_r0, 8);
683     src_r0 = _mm_cvtepu16_epi32(src_r0);
684     src_r1 = _mm_cvtepu16_epi32(src_r1);
685     src_r3 = _mm_srli_si128(src_r2, 8);
686     src_r2 = _mm_cvtepu16_epi32(src_r2);
687     src_r3 = _mm_cvtepu16_epi32(src_r3);
688 
689     temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
690     scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
691     temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
692     scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
693     temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
694     temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
695 
696     temp0 = _mm_mullo_epi32(temp0, src_r0);
697     temp1 = _mm_mullo_epi32(temp1, src_r1);
698     temp2 = _mm_mullo_epi32(temp2, src_r2);
699     temp3 = _mm_mullo_epi32(temp3, src_r3);
700 
701     temp0 = _mm_add_epi32(temp0, rnd_fact);
702     temp1 = _mm_add_epi32(temp1, rnd_fact);
703     temp2 = _mm_add_epi32(temp2, rnd_fact);
704     temp3 = _mm_add_epi32(temp3, rnd_fact);
705 
706     temp0 = _mm_srli_epi32(temp0, u4_qbits);
707     temp1 = _mm_srli_epi32(temp1, u4_qbits);
708     temp2 = _mm_srli_epi32(temp2, u4_qbits);
709     temp3 = _mm_srli_epi32(temp3, u4_qbits);
710 
711     temp0 = _mm_packs_epi32(temp0, temp1);
712     temp2 = _mm_packs_epi32(temp2, temp3);
713 
714     temp0 = _mm_sign_epi16(temp0, sign_reg0);
715     temp2 = _mm_sign_epi16(temp2, sign_reg2);
716 
717     temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
718     temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);
719 
720     _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
721     _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
722 
723     cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
724     cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
725 
726     mask0 = _mm_movemask_epi8(cmp0);
727     mask1 = _mm_movemask_epi8(cmp1);
728     u4_zero_coeff = 0;
729     if(mask0)
730     {
731         if(mask0 == 0xffff)
732             u4_zero_coeff += 8;
733         else
734         {
735             cmp0 = _mm_and_si128(temp_1, cmp0);
736             sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
737             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
738             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
739             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
740         }
741     }
742     if(mask1)
743     {
744         if(mask1 == 0xffff)
745             u4_zero_coeff += 8;
746         else
747         {
748             cmp1 = _mm_and_si128(temp_1, cmp1);
749             sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
750             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
751             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
752             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
753         }
754     }
755 
756     /* Return total nonzero coefficients in the current sub block */
757     u4_nonzero_coeff = 16 - u4_zero_coeff;
758     *pu1_nnz = u4_nonzero_coeff;
759 }
760 
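/*
 * A hedged usage sketch: how a caller might wrap raw pointers in the
 * buffer_container_t / resi_trans_quant_constants_t arguments consumed by
 * isvc_resi_trans_quant_4x4_with_res_pred_sse42().  The wrapper name and the
 * raw-pointer interface are ours; only the structure fields that the kernels
 * above actually read (pv_data, i4_data_stride, pu2_scale_matrix,
 * pu2_threshold_matrix, u4_qbits, u4_round_factor) are populated here.
 */
static void isvc_resi_trans_quant_with_res_pred_usage_sketch(
    UWORD8 *pu1_src, WORD32 i4_src_stride, UWORD8 *pu1_pred, WORD32 i4_pred_stride,
    WORD16 *pi2_out, WORD16 *pi2_upsampled_res, WORD32 i4_upsampled_res_stride,
    UWORD16 *pu2_scale_matrix, UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
    UWORD32 u4_round_factor, UWORD8 *pu1_nnz, WORD16 *pi2_dc_out)
{
    buffer_container_t s_src;
    buffer_container_t s_pred;
    buffer_container_t s_out;
    buffer_container_t s_upsampled_res;
    resi_trans_quant_constants_t s_quant_constants;

    s_src.pv_data = pu1_src;
    s_src.i4_data_stride = i4_src_stride;
    s_pred.pv_data = pu1_pred;
    s_pred.i4_data_stride = i4_pred_stride;
    s_out.pv_data = pi2_out;
    /* The kernels above assert a contiguous 4x4 output block */
    s_out.i4_data_stride = 4;
    s_upsampled_res.pv_data = pi2_upsampled_res;
    s_upsampled_res.i4_data_stride = i4_upsampled_res_stride;

    s_quant_constants.pu2_scale_matrix = pu2_scale_matrix;
    s_quant_constants.pu2_threshold_matrix = pu2_threshold_matrix;
    s_quant_constants.u4_qbits = u4_qbits;
    s_quant_constants.u4_round_factor = u4_round_factor;

    /* u1_use_upsampled_res must be 1 for the *_with_res_pred_* variants */
    isvc_resi_trans_quant_4x4_with_res_pred_sse42(&s_src, &s_pred, &s_out, &s_upsampled_res,
                                                  &s_quant_constants, pu1_nnz, pi2_dc_out, 1);
}
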
/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward transform and quantization on a 4x4
 *   chroma block
 *
 * @par Description:
 *   The function accepts a source buffer and a prediction buffer. From these
 *   it computes the residue, which is then transformed and quantized. The
 *   transform and quantization are computed in place, using the residue
 *   buffer as working storage.
 *
 * @param[in] ps_src
 *   Pointer to the source buffer container
 *
 * @param[in] ps_pred
 *   Pointer to the prediction buffer container
 *
 * @param[out] ps_out
 *   Pointer to the output buffer container that receives the quantized
 *   residual sub-block
 *
 * @param[in] ps_upsampled_res
 *   Pointer to the upsampled residual buffer container (unused in this
 *   variant)
 *
 * @param[in] ps_quant_constants
 *   Pointer to the quantization constants: forward quant scale matrix,
 *   forward quant threshold matrix, rounding factor and
 *   u4_qbits = QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[out] pu1_nnz
 *   Total number of non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_dc_out
 *   DC coefficient of the transformed block, before quantization
 *
 * @param[in] u1_use_upsampled_res
 *   Must be 0 for this variant
 *
 * @returns
 *   None
 *
 * @remarks
 *   A short note on the interleaved chroma access pattern, with a scalar
 *   sketch, follows the definition of this function.
 *
 *******************************************************************************
 */
833 void isvc_resi_trans_quant_chroma_4x4_sse42(buffer_container_t *ps_src, buffer_container_t *ps_pred,
834                                             buffer_container_t *ps_out,
835                                             buffer_container_t *ps_upsampled_res,
836                                             resi_trans_quant_constants_t *ps_quant_constants,
837                                             UWORD8 *pu1_nnz, WORD16 *pi2_dc_out,
838                                             UWORD8 u1_use_upsampled_res)
839 {
840     UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
841     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
842     WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
843     WORD32 i4_src_stride = ps_src->i4_data_stride;
844     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
845     WORD32 i4_out_stride = ps_out->i4_data_stride;
846     const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
847     const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
848     UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
849     UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
850     WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
851     WORD32 mask0, mask1;
852     __m128i cmp0, cmp1, sum0, sum1, sum2;
853     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
854     __m128i temp_2 = _mm_set1_epi16(2);
855     __m128i temp_1 = _mm_set1_epi16(1);
856     __m128i src_r0, src_r1, src_r2, src_r3;
857     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
858     __m128i temp0, temp1, temp2, temp3;
859     /* all bits reset to zero */
860     __m128i zero_8x16b = _mm_setzero_si128();
861     __m128i sign_reg0, sign_reg2;
862     __m128i scalemat_r0_r1, scalemat_r2_r3;
863     __m128i threshold_r0_r1, threshold_r2_r3;
864     __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
865     __m128i chroma_mask = _mm_set1_epi16(0xFF);
866 
867     ASSERT(0 == u1_use_upsampled_res);
868     ASSERT(4 == i4_out_stride);
869     UNUSED(u1_use_upsampled_res);
870     UNUSED(i4_out_stride);
871     UNUSED(ps_upsampled_res);
872 
873     /* b00 b01 b02 b03 b10 b11 b12 b13
874    -- the scaling matrix 0th,1st row */
875     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
876 
877     /* b20 b21 b22 b23 b30 b31 b32 b33
878      -- the scaling matrix 2nd,3rd row */
879     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
880 
881     /* b00 b01 b02 b03 b10 b11 b12 b13
882      -- the threshold matrix 0th,1st row */
883     threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
884 
885     /* b20 b21 b22 b23 b30 b31 b32 b33
886      -- the threshold matrix 2nd,3rd row */
887     threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
888 
889     /* a00 a01 a02 a03 0 0 0 0 0
890     0 0 0 -- all 8 bits */
891     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
892     /* a10 a11 a12 a13 0 0 0 0 0 0 0
893      0 -- all 8 bits */
894     src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
895     /* a20 a21 a22 a23 0 0 0 0 0 0 0
896     0 -- all 8 bits */
897     src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
898     /* a30 a31 a32 a33 0 0 0 0 0 0 0
899     0 -- all 8 bits */
900     src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
901 
902     src_r0 = _mm_and_si128(src_r0, chroma_mask);
903     src_r1 = _mm_and_si128(src_r1, chroma_mask);
904     src_r2 = _mm_and_si128(src_r2, chroma_mask);
905     src_r3 = _mm_and_si128(src_r3, chroma_mask);
906 
907     /* p00 p01 p02 p03 0 0 0 0 0
908      0 0 0 -- all 8 bits */
909     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
910     /* p10 p11 p12 p13 0 0 0 0 0
911     0 0 0 -- all 8 bits */
912     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
913     /* p20 p21 p22 p23 0 0 0 0 0
914     0 0 0 -- all 8 bits */
915     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
916     /* p30 p31 p32 p33 0 0 0 0 0
917     0 0 0 -- all 8 bits */
918     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
919 
920     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
921     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
922     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
923     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
924 
925     src_r0 = _mm_sub_epi16(src_r0, pred_r0);
926     src_r1 = _mm_sub_epi16(src_r1, pred_r1);
927     src_r2 = _mm_sub_epi16(src_r2, pred_r2);
928     src_r3 = _mm_sub_epi16(src_r3, pred_r3);
929 
930     /* Perform Forward transform */
931     /*-------------------------------------------------------------*/
932     /* DCT [ Horizontal transformation ]                          */
933     /*-------------------------------------------------------------*/
934     // Matrix transpose
935     /*
936      *  a0 a1 a2 a3
937      *  b0 b1 b2 b3
938      *  c0 c1 c2 c3
939      *  d0 d1 d2 d3
940      */
941     /* a0 b0 a1 b1 a2 b2 a3 b3 */
942     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
943     /* c0 d0 c1 d1 c2 d2 c3 d3 */
944     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
945     /* a0 b0 c0 d0 a1 b1 c1 d1 */
946     temp1 = _mm_unpacklo_epi32(temp0, temp2);
947     /* a2 b2 c2 d2 a3 b3 c3 d3 */
948     temp3 = _mm_unpackhi_epi32(temp0, temp2);
949 
950     /* a0 b0 c0 d0 */
951     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
952     /* a1 b1 c1 d1 */
953     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
954     /* a2 b2 c2 d2 */
955     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
956     /* a3 b3 c3 d3 */
957     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
958 
959     /*----------------------------------------------------------*/
960     /* x0 = z0 + z3                                */
961     temp0 = _mm_add_epi16(src_r0, src_r3);
962     /* x1 = z1 + z2                                */
963     temp1 = _mm_add_epi16(src_r1, src_r2);
964     /* x2 = z1 - z2                                */
965     temp2 = _mm_sub_epi16(src_r1, src_r2);
966     /* x3 = z0 - z3                                */
967     temp3 = _mm_sub_epi16(src_r0, src_r3);
968 
969     /* z0 = x0 + x1                                */
970     src_r0 = _mm_add_epi16(temp0, temp1);
971     /* z1 = (x3 << 1) + x2                         */
972     src_r1 = _mm_slli_epi16(temp3, 1);
973     src_r1 = _mm_add_epi16(src_r1, temp2);
974     /* z2 = x0 - x1                                */
975     src_r2 = _mm_sub_epi16(temp0, temp1);
976     /* z3 = x3 - (x2 << 1)                         */
977     src_r3 = _mm_slli_epi16(temp2, 1);
978     src_r3 = _mm_sub_epi16(temp3, src_r3);
979 
980     // Matrix transpose
981     /*
982      *  a0 b0 c0 d0
983      *  a1 b1 c1 d1
984      *  a2 b2 c2 d2
985      *  a3 b3 c3 d3
986      */
987     /* a0 a1 b0 b1 c0 c1 d0 d1 */
988     temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
989     /* a2 a3 b2 b3 c2 c3 d2 d3 */
990     temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
991     /* a0 a1 a2 a3 b0 b1 b2 b3 */
992     temp1 = _mm_unpacklo_epi32(temp0, temp2);
993     /* c0 c1 c2 c3 d0 d1 d2 d3 */
994     temp3 = _mm_unpackhi_epi32(temp0, temp2);
995 
996     /* a0 a1 a2 a3 */
997     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
998     /* b0 b1 b2 b3 */
999     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
1000     /* c0 c1 c2 c3 */
1001     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
1002     /* d0 d1 d2 d3 */
1003     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
1004 
1005     /*----------------------------------------------------------*/
1006     /* x0 = z0 + z3                                  */
1007     temp0 = _mm_add_epi16(src_r0, src_r3);
1008     /* x1 = z1 + z2                                  */
1009     temp1 = _mm_add_epi16(src_r1, src_r2);
1010     /* x2 = z1 - z2                                  */
1011     temp2 = _mm_sub_epi16(src_r1, src_r2);
1012     /* x3 = z0 - z3                                  */
1013     temp3 = _mm_sub_epi16(src_r0, src_r3);
1014 
1015     /* z0 = x0 + x1                                  */
1016     src_r0 = _mm_add_epi16(temp0, temp1);
1017     /* z1 = (x3 << 1) + x2                           */
1018     src_r1 = _mm_slli_epi16(temp3, 1);
1019     src_r1 = _mm_add_epi16(src_r1, temp2);
1020     /* z2 = x0 - x1                                  */
1021     src_r2 = _mm_sub_epi16(temp0, temp1);
1022     /* z3 = x3 - (x2 << 1)                           */
1023     src_r3 = _mm_slli_epi16(temp2, 1);
1024     src_r3 = _mm_sub_epi16(temp3, src_r3);
1025 
1026     /* get the first 16 bits from the register */
1027     tmp_dc = _mm_extract_epi16(src_r0, 0);
1028     *pi2_dc_out = tmp_dc;
1029 
1030     /* a0 a1 a2 a3 b0 b1 b2 b3 */
1031     src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
1032     /* c0 c1 c2 c3 d0 d1 d2 d3 */
1033     src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
1034     sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
1035     sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);
1036 
1037     sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
1038     sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);
1039 
1040     sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
1041     sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);
1042 
1043     src_r0 = _mm_abs_epi16(src_r0);
1044     src_r2 = _mm_abs_epi16(src_r2);
1045 
1046     threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
1047     threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);
1048 
1049     src_r1 = _mm_srli_si128(src_r0, 8);
1050     src_r0 = _mm_cvtepu16_epi32(src_r0);
1051     src_r1 = _mm_cvtepu16_epi32(src_r1);
1052     src_r3 = _mm_srli_si128(src_r2, 8);
1053     src_r2 = _mm_cvtepu16_epi32(src_r2);
1054     src_r3 = _mm_cvtepu16_epi32(src_r3);
1055 
1056     temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
1057     scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
1058     temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
1059     scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
1060     temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
1061     temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);
1062 
1063     temp0 = _mm_mullo_epi32(temp0, src_r0);
1064     temp1 = _mm_mullo_epi32(temp1, src_r1);
1065     temp2 = _mm_mullo_epi32(temp2, src_r2);
1066     temp3 = _mm_mullo_epi32(temp3, src_r3);
1067 
1068     temp0 = _mm_add_epi32(temp0, rnd_fact);
1069     temp1 = _mm_add_epi32(temp1, rnd_fact);
1070     temp2 = _mm_add_epi32(temp2, rnd_fact);
1071     temp3 = _mm_add_epi32(temp3, rnd_fact);
1072 
1073     temp0 = _mm_srli_epi32(temp0, u4_qbits);
1074     temp1 = _mm_srli_epi32(temp1, u4_qbits);
1075     temp2 = _mm_srli_epi32(temp2, u4_qbits);
1076     temp3 = _mm_srli_epi32(temp3, u4_qbits);
1077 
1078     temp0 = _mm_packs_epi32(temp0, temp1);
1079     temp2 = _mm_packs_epi32(temp2, temp3);
1080 
1081     temp0 = _mm_sign_epi16(temp0, sign_reg0);
1082     temp2 = _mm_sign_epi16(temp2, sign_reg2);
1083 
1084     temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
1085     temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);
1086 
1087     _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
1088     _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);
1089 
1090     cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
1091     cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);
1092 
1093     mask0 = _mm_movemask_epi8(cmp0);
1094     mask1 = _mm_movemask_epi8(cmp1);
1095     u4_zero_coeff = 0;
1096     if(mask0)
1097     {
1098         if(mask0 == 0xffff)
1099             u4_zero_coeff += 8;
1100         else
1101         {
1102             cmp0 = _mm_and_si128(temp_1, cmp0);
1103             sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
1104             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
1105             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
1106             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
1107         }
1108     }
1109     if(mask1)
1110     {
1111         if(mask1 == 0xffff)
1112             u4_zero_coeff += 8;
1113         else
1114         {
1115             cmp1 = _mm_and_si128(temp_1, cmp1);
1116             sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
1117             sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
1118             sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
1119             u4_zero_coeff += _mm_cvtsi128_si32(sum2);
1120         }
1121     }
1122 
1123     /* Return total nonzero coefficients in the current sub block */
1124     u4_nonzero_coeff = 16 - u4_zero_coeff;
1125     *pu1_nnz = u4_nonzero_coeff;
1126 }
1127 
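/*
 * Note on the chroma kernels: each row load above fetches 8 bytes and the
 * 0xFF mask keeps only every other byte, so a single component of an
 * interleaved Cb/Cr (semi-planar) buffer enters the transform; the caller is
 * expected to select Cb or Cr through the base pointer it passes in.  A
 * scalar sketch of that residue access pattern (illustrative only, helper
 * name is ours):
 */
static void isvc_chroma_4x4_residue_scalar_sketch(const UWORD8 *pu1_src, WORD32 i4_src_stride,
                                                  const UWORD8 *pu1_pred, WORD32 i4_pred_stride,
                                                  WORD16 *pi2_resi)
{
    WORD32 i, j;

    for(i = 0; i < 4; i++)
    {
        for(j = 0; j < 4; j++)
        {
            /* A step of 2 within the row skips the other interleaved chroma component */
            pi2_resi[i * 4 + j] = (WORD16) ((WORD32) pu1_src[i * i4_src_stride + 2 * j] -
                                            (WORD32) pu1_pred[i * i4_pred_stride + 2 * j]);
        }
    }
}
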
1128 void isvc_resi_trans_quant_chroma_4x4_with_res_pred_sse42(
1129     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_out,
1130     buffer_container_t *ps_upsampled_res, resi_trans_quant_constants_t *ps_quant_constants,
1131     UWORD8 *pu1_nnz, WORD16 *pi2_dc_out, UWORD8 u1_use_upsampled_res)
1132 {
1133     UWORD8 *pu1_src = (UWORD8 *) ps_src->pv_data;
1134     UWORD8 *pu1_pred = (UWORD8 *) ps_pred->pv_data;
1135     WORD16 *pi2_out = (WORD16 *) ps_out->pv_data;
1136     WORD16 *pi2_upsampled_res = ps_upsampled_res ? (WORD16 *) ps_upsampled_res->pv_data : NULL;
1137     WORD32 i4_src_stride = ps_src->i4_data_stride;
1138     WORD32 i4_pred_stride = ps_pred->i4_data_stride;
1139     WORD32 i4_out_stride = ps_out->i4_data_stride;
1140     WORD32 i4_upsampled_res_stride = ps_upsampled_res ? ps_upsampled_res->i4_data_stride : 0;
1141     const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
1142     const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
1143     UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
1144     UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
1145     WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
1146     WORD32 mask0, mask1;
1147     __m128i cmp0, cmp1, sum0, sum1, sum2;
1148     __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
1149     __m128i temp_2 = _mm_set1_epi16(2);
1150     __m128i temp_1 = _mm_set1_epi16(1);
1151     __m128i src_r0, src_r1, src_r2, src_r3;
1152     __m128i pred_r0, pred_r1, pred_r2, pred_r3;
1153     __m128i temp0, temp1, temp2, temp3;
1154     /* all bits reset to zero */
1155     __m128i zero_8x16b = _mm_setzero_si128();
1156     __m128i neg_255_8x16b = _mm_set1_epi16(-((WORD16) UINT8_MAX));
1157     __m128i pos_255_8x16b = _mm_set1_epi16(((WORD16) UINT8_MAX));
1158     __m128i sign_reg0, sign_reg2;
1159     __m128i scalemat_r0_r1, scalemat_r2_r3;
1160     __m128i upsampled_res0, upsampled_res1, upsampled_res2, upsampled_res3;
1161     __m128i threshold_r0_r1, threshold_r2_r3;
1162     __m128i threshold_mask_r0_r1, threshold_mask_r2_r3;
1163     __m128i chroma_mask = _mm_set1_epi16(0xFF);
1164 
1165     ASSERT(1 == u1_use_upsampled_res);
1166     ASSERT(4 == i4_out_stride);
1167     UNUSED(u1_use_upsampled_res);
1168     UNUSED(i4_out_stride);
1169     UNUSED(ps_upsampled_res);
1170 
1171     /* b00 b01 b02 b03 b10 b11 b12 b13
1172    -- the scaling matrix 0th,1st row */
1173     scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));
1174 
1175     /* b20 b21 b22 b23 b30 b31 b32 b33
1176      -- the scaling matrix 2nd,3rd row */
1177     scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8));
1178 
1179     /* b00 b01 b02 b03 b10 b11 b12 b13
1180      -- the threshold matrix 0th,1st row */
1181     threshold_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix));
1182 
1183     /* b20 b21 b22 b23 b30 b31 b32 b33
1184      -- the threshold matrix 2nd,3rd row */
1185     threshold_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_threshold_matrix + 8));
1186 
1187     /* a00 a01 a02 a03 0 0 0 0 0
1188     0 0 0 -- all 8 bits */
1189     src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));
1190     /* a10 a11 a12 a13 0 0 0 0 0 0 0
1191      0 -- all 8 bits */
1192     src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[i4_src_stride]));
1193     /* a20 a21 a22 a23 0 0 0 0 0 0 0
1194     0 -- all 8 bits */
1195     src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * i4_src_stride]));
1196     /* a30 a31 a32 a33 0 0 0 0 0 0 0
1197     0 -- all 8 bits */
1198     src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * i4_src_stride]));
1199 
1200     src_r0 = _mm_and_si128(src_r0, chroma_mask);
1201     src_r1 = _mm_and_si128(src_r1, chroma_mask);
1202     src_r2 = _mm_and_si128(src_r2, chroma_mask);
1203     src_r3 = _mm_and_si128(src_r3, chroma_mask);
1204 
1205     /* p00 p01 p02 p03 0 0 0 0 0
1206      0 0 0 -- all 8 bits */
1207     pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));
1208     /* p10 p11 p12 p13 0 0 0 0 0
1209     0 0 0 -- all 8 bits */
1210     pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[i4_pred_stride]));
1211     /* p20 p21 p22 p23 0 0 0 0 0
1212     0 0 0 -- all 8 bits */
1213     pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * i4_pred_stride]));
1214     /* p30 p31 p32 p33 0 0 0 0 0
1215     0 0 0 -- all 8 bits */
1216     pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * i4_pred_stride]));
1217 
1218     pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
1219     pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
1220     pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
1221     pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
1222 
1223     src_r0 = _mm_sub_epi16(src_r0, pred_r0);
1224     src_r1 = _mm_sub_epi16(src_r1, pred_r1);
1225     src_r2 = _mm_sub_epi16(src_r2, pred_r2);
1226     src_r3 = _mm_sub_epi16(src_r3, pred_r3);
1227 
1228     /* load upsampled residual values and subtract from
1229     the previous residue */
1230     upsampled_res0 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[0]));
1231 
1232     upsampled_res1 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[i4_upsampled_res_stride]));
1233 
1234     upsampled_res2 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[2 * i4_upsampled_res_stride]));
1235 
1236     upsampled_res3 = _mm_loadu_si128((__m128i *) (&pi2_upsampled_res[3 * i4_upsampled_res_stride]));
1237 
1238     src_r0 = _mm_sub_epi16(src_r0, upsampled_res0);
1239     src_r1 = _mm_sub_epi16(src_r1, upsampled_res1);
1240     src_r2 = _mm_sub_epi16(src_r2, upsampled_res2);
1241     src_r3 = _mm_sub_epi16(src_r3, upsampled_res3);
1242 
1243     src_r1 = _mm_unpacklo_epi16(src_r0, src_r1);
1244     src_r3 = _mm_unpacklo_epi16(src_r2, src_r3);
1245 
1246     /* Saturate all values < -255 to -255 and retain the rest as it is */
1247     src_r1 = _mm_max_epi16(src_r1, neg_255_8x16b);
1248     /* Saturate all values > 255 to 255 and retain the rest as it is */
1249     temp0 = _mm_min_epi16(src_r1, pos_255_8x16b);
1250 
1251     /* Saturate all values < -255 to -255 and retain the rest as it is */
1252     src_r3 = _mm_max_epi16(src_r3, neg_255_8x16b);
1253     /* Saturate all values > 255 to 255 and retain the rest as it is */
1254     temp2 = _mm_min_epi16(src_r3, pos_255_8x16b);
1255 
1256     /* Perform Forward transform */
1257     /*-------------------------------------------------------------*/
1258     /* DCT [ Horizontal transformation ]                          */
1259     /*-------------------------------------------------------------*/
1260     // Matrix transpose
1261     /*
1262      *  a0 a1 a2 a3
1263      *  b0 b1 b2 b3
1264      *  c0 c1 c2 c3
1265      *  d0 d1 d2 d3
1266      */
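    /* a0 b0 c0 d0 a1 b1 c1 d1 */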
1267     temp1 = _mm_unpacklo_epi32(temp0, temp2);
1268     /* a2 b2 c2 d2 a3 b3 c3 d3 */
1269     temp3 = _mm_unpackhi_epi32(temp0, temp2);
1270 
1271     /* a0 b0 c0 d0 */
1272     src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
1273     /* a1 b1 c1 d1 */
1274     src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
1275     /* a2 b2 c2 d2 */
1276     src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
1277     /* a3 b3 c3 d3 */
1278     src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);
1279 
1280     /*----------------------------------------------------------*/
1281     /* x0 = z0 + z3                                */
1282     temp0 = _mm_add_epi16(src_r0, src_r3);
1283     /* x1 = z1 + z2                                */
1284     temp1 = _mm_add_epi16(src_r1, src_r2);
1285     /* x2 = z1 - z2                                */
1286     temp2 = _mm_sub_epi16(src_r1, src_r2);
1287     /* x3 = z0 - z3                                */
1288     temp3 = _mm_sub_epi16(src_r0, src_r3);
1289 
1290     /* z0 = x0 + x1                                */
1291     src_r0 = _mm_add_epi16(temp0, temp1);
1292     /* z1 = (x3 << 1) + x2                         */
1293     src_r1 = _mm_slli_epi16(temp3, 1);
1294     src_r1 = _mm_add_epi16(src_r1, temp2);
1295     /* z2 = x0 - x1                                */
1296     src_r2 = _mm_sub_epi16(temp0, temp1);
1297     /* z3 = x3 - (x2 << 1)                         */
1298     src_r3 = _mm_slli_epi16(temp2, 1);
1299     src_r3 = _mm_sub_epi16(temp3, src_r3);

    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    /* a0 a1 b0 b1 c0 c1 d0 d1 */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1);
    /* a2 a3 b2 b3 c2 c3 d2 d3 */
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3);
    /* a0 a1 a2 a3 b0 b1 b2 b3 */
    temp1 = _mm_unpacklo_epi32(temp0, temp2);
    /* c0 c1 c2 c3 d0 d1 d2 d3 */
    temp3 = _mm_unpackhi_epi32(temp0, temp2);

    /* a0 a1 a2 a3 */
    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b);
    /* b0 b1 b2 b3 */
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b);
    /* c0 c1 c2 c3 */
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b);
    /* d0 d1 d2 d3 */
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b);

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                  */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                  */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                  */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                  */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                  */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                           */
    src_r1 = _mm_slli_epi16(temp3, 1);
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                  */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                           */
    src_r3 = _mm_slli_epi16(temp2, 1);
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    /* get the first 16 bits from the register */
    tmp_dc = _mm_extract_epi16(src_r0, 0);
    *pi2_dc_out = tmp_dc;

    /* a0 a1 a2 a3 b0 b1 b2 b3 */
    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1);
    /* c0 c1 c2 c3 d0 d1 d2 d3 */
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3);
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
    sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    threshold_mask_r0_r1 = _mm_cmpgt_epi16(threshold_r0_r1, src_r0);
    threshold_mask_r2_r3 = _mm_cmpgt_epi16(threshold_r2_r3, src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact);
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits);
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    temp0 = _mm_andnot_si128(threshold_mask_r0_r1, temp0);
    temp2 = _mm_andnot_si128(threshold_mask_r2_r3, temp2);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz = u4_nonzero_coeff;
}
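
/*
 * Illustrative scalar sketch (not part of the library API; the helper name and
 * its signature are hypothetical): it mirrors, for a single coefficient, the
 * quantization that the SSE4.2 code above applies to a whole 4x4 block at
 * once -- take the absolute value, scale by the forward quant matrix entry,
 * add the round factor, right shift by u4_qbits, restore the sign, and zero
 * the level when the threshold matrix entry exceeds the absolute transformed
 * coefficient.
 */
static WORD16 isvc_quant_coeff_scalar_sketch(WORD16 i2_coeff, UWORD16 u2_scale,
                                             UWORD16 u2_threshold, UWORD32 u4_round_factor,
                                             UWORD32 u4_qbits)
{
    WORD32 i4_sign = (i2_coeff < 0) ? -1 : 1;
    UWORD32 u4_abs_coeff = (UWORD32) ((i2_coeff < 0) ? -i2_coeff : i2_coeff);
    UWORD32 u4_level;

    /* Mirrors the _mm_cmpgt_epi16 / _mm_andnot_si128 threshold masking above */
    if(u2_threshold > u4_abs_coeff)
    {
        return 0;
    }

    /* Mirrors the _mm_mullo_epi32 / _mm_add_epi32 / _mm_srli_epi32 sequence */
    u4_level = ((u4_abs_coeff * u2_scale) + u4_round_factor) >> u4_qbits;

    /* Mirrors the _mm_sign_epi16 sign restoration */
    return (WORD16) (i4_sign * (WORD32) u4_level);
}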

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward hadamard transform and quantization on a
 *   4x4 block of DC coefficients
 *
 * @par Description:
 *   The function accepts a 4x4 block of DC coefficients in pi2_src, applies
 *   the forward hadamard transform, quantizes the result and writes the
 *   quantized levels to pi2_dst.
 *
 * @param[in] pi2_src
 *   Pointer to the input 4x4 block of DC coefficients
 *
 * @param[out] pi2_dst
 *   Pointer to the output block of quantized coefficients
 *
 * @param[in] ps_quant_constants
 *   Pointer to the quantization constants: forward quant scale matrix,
 *   forward quant threshold matrix, round factor and u4_qbits
 *   (QP_BITS_h264_4x4 + floor(QP/6))
 *
 * @param[out] pu1_nnz
 *   Total non-zero coefficients in the current sub-block
 *
 * @returns
 *   None
 *
 * @remarks
 *   None
 *
 *******************************************************************************
 */

void isvc_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                   resi_trans_quant_constants_t *ps_quant_constants,
                                   UWORD8 *pu1_nnz)
{
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
    WORD32 u4_zero_coeff, u4_nonzero_coeff = 0;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    WORD32 mask0, mask1;
    __m128i src_r0_r1, src_r2_r3, sign_reg;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i temp0, temp1, temp2, temp3;
    __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);

    UNUSED(pu2_threshold_matrix);

    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));  // a00 a01 a02 a03 a10 a11 a12 a13 -- the
                                                         // source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8));  // a20 a21 a22 a23 a30 a31 a32 a33 --
                                                             // the source matrix 2nd,3rd row
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg);  // a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg);  // b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg);  // c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg);  // d0 d1 d2 d3

    /* Perform forward transform */
    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Horizontal transformation ] */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);  // a0 b0 a1 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);  // c0 d0 c1 d1
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);  // a2 b2 a3 b3
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);  // c2 d2 c3 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);   // a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);   // a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);   // a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);   // a3 b3 c3 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Vertical transformation ] */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);  // a0 a1 b0 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);  // a2 a3 b2 b3
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);  // c0 c1 d0 d1
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);  // c2 c3 d2 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);   // a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);   // b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);   // c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);   // d0 d1 d2 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    src_r0 = _mm_srai_epi32(src_r0, 1);
    src_r1 = _mm_srai_epi32(src_r1, 1);
    src_r2 = _mm_srai_epi32(src_r2, 1);
    src_r3 = _mm_srai_epi32(src_r3, 1);

    // Quantization
    sign_reg0 =
        _mm_cmpgt_epi32(zero_8x16b, src_r0);  // Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
    sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
    sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);

    sign_reg0 = _mm_packs_epi32(sign_reg0,
                                sign_reg1);  // Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);

    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);  // Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_slli_epi16(sign_reg2, 1);

    sign_reg0 =
        _mm_add_epi16(temp_1, sign_reg0);  // Sign = -1 or 1 depending on <0 or >0 respectively
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi32(src_r0);  // Absolute values
    src_r1 = _mm_abs_epi32(src_r1);
    src_r2 = _mm_abs_epi32(src_r2);
    src_r3 = _mm_abs_epi32(src_r3);

    temp0 = _mm_mullo_epi32(scale_val, src_r0);  // multiply by
                                                 // pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, src_r1);
    temp2 = _mm_mullo_epi32(scale_val, src_r2);
    temp3 = _mm_mullo_epi32(scale_val, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact);  // Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0,
                           u4_qbits);  // Right shift by qbits; unsigned variable,
                                       // so shift right immediate works
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);  // Final values are 16-bits only.
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);  // Sign restoration
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    pu1_nnz[0] = u4_nonzero_coeff;
}
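
/*
 * Illustrative scalar sketch (hypothetical helper, not part of the library):
 * the same 4x4 forward hadamard butterfly that the SSE4.2 routine above
 * evaluates with packed 32-bit lanes, written out per element, including the
 * final >> 1 normalisation. pi2_src is assumed to hold the 16 DC coefficients
 * in raster order.
 */
static void isvc_hadamard_4x4_scalar_sketch(const WORD16 *pi2_src, WORD32 ai4_dst[4][4])
{
    WORD32 ai4_tmp[4][4];
    WORD32 i;

    /* Horizontal pass: butterfly each row */
    for(i = 0; i < 4; i++)
    {
        WORD32 z0 = pi2_src[4 * i + 0], z1 = pi2_src[4 * i + 1];
        WORD32 z2 = pi2_src[4 * i + 2], z3 = pi2_src[4 * i + 3];
        WORD32 x0 = z0 + z3, x1 = z1 + z2, x2 = z1 - z2, x3 = z0 - z3;

        ai4_tmp[i][0] = x0 + x1;
        ai4_tmp[i][1] = x2 + x3;
        ai4_tmp[i][2] = x0 - x1;
        ai4_tmp[i][3] = x3 - x2;
    }

    /* Vertical pass: butterfly each column, then halve, as the intrinsics do
     * with _mm_srai_epi32(..., 1) */
    for(i = 0; i < 4; i++)
    {
        WORD32 z0 = ai4_tmp[0][i], z1 = ai4_tmp[1][i];
        WORD32 z2 = ai4_tmp[2][i], z3 = ai4_tmp[3][i];
        WORD32 x0 = z0 + z3, x1 = z1 + z2, x2 = z1 - z2, x3 = z0 - z3;

        ai4_dst[0][i] = (x0 + x1) >> 1;
        ai4_dst[1][i] = (x2 + x3) >> 1;
        ai4_dst[2][i] = (x0 - x1) >> 1;
        ai4_dst[3][i] = (x3 - x2) >> 1;
    }
}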

/**
 *******************************************************************************
 *
 * @brief
 *   This function performs forward hadamard transform and quantization on a
 *   2x2 block of DC coefficients for both chroma planes
 *
 * @par Description:
 *   The function accepts the 2x2 chroma DC coefficient blocks of both planes
 *   in pi2_src, applies the forward hadamard transform, quantizes the result
 *   and writes the quantized levels to pi2_dst.
 *
 * @param[in] pi2_src
 *   Pointer to the input DC coefficients of both chroma planes
 *   (4 coefficients per plane)
 *
 * @param[out] pi2_dst
 *   Pointer to the output block of quantized coefficients
 *
 * @param[in] ps_quant_constants
 *   Pointer to the quantization constants: forward quant scale matrix,
 *   forward quant threshold matrix, round factor and u4_qbits
 *   (QP_BITS_h264_4x4 + floor(QP/6))
 *
 * @param[out] pu1_nnz
 *   Total non-zero coefficients per plane
 *
 * @returns
 *   None
 *
 * @remarks
 *   The NNZ of the two DC blocks is written to pu1_nnz[0] and pu1_nnz[1]
 *   respectively
 *
 *******************************************************************************
 */

void isvc_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                      resi_trans_quant_constants_t *ps_quant_constants,
                                      UWORD8 *pu1_nnz)
{
    const UWORD16 *pu2_scale_matrix = ps_quant_constants->pu2_scale_matrix;
    const UWORD16 *pu2_threshold_matrix = ps_quant_constants->pu2_threshold_matrix;
    UWORD32 u4_qbits = ps_quant_constants->u4_qbits;
    UWORD32 u4_round_factor = ps_quant_constants->u4_round_factor;
    WORD32 val, nonzero_coeff_0 = 0, nonzero_coeff_1 = 0;
    __m128i cmp, cmp0, cmp1;
    __m128i sum0, sum1;
    WORD32 mask, mask0, mask1;
    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    __m128i sign_reg0, sign_reg1;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);

    UNUSED(pu2_threshold_matrix);

    src = _mm_loadu_si128((__m128i *) pi2_src);  // a0 a1 a2 a3 b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
    plane_0 = _mm_unpacklo_epi16(src, sign_reg);  // a0 a1 a2 a3 -- 32 bits
    plane_1 = _mm_unpackhi_epi16(src, sign_reg);  // b0 b1 b2 b3 -- 32 bits

    temp0 = _mm_hadd_epi32(plane_0, plane_1);  // a0+a1 a2+a3 b0+b1 b2+b3
    temp1 = _mm_hsub_epi32(plane_0, plane_1);  // a0-a1 a2-a3 b0-b1 b2-b3

    plane_0 = _mm_hadd_epi32(temp0, temp1);  // a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
    plane_1 = _mm_hsub_epi32(temp0, temp1);  // a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3

    temp0 =
        _mm_unpacklo_epi32(plane_0, plane_1);  // a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
    temp1 =
        _mm_unpackhi_epi32(plane_0, plane_1);  // a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_unpacklo_epi64(temp0, temp1);  // a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
    plane_1 = _mm_unpackhi_epi64(temp0, temp1);  // b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8);  // a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8);  // b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3
    // Quantization
    sign_reg0 =
        _mm_cmpgt_epi32(zero_8x16b, plane_0);  // Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);

    sign_reg0 = _mm_packs_epi32(sign_reg0,
                                sign_reg1);    // Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);  // Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg0 =
        _mm_add_epi16(temp_1, sign_reg0);  // Sign = -1 or 1 depending on <0 or >0 respectively

    plane_0 = _mm_abs_epi32(plane_0);  // Absolute values
    plane_1 = _mm_abs_epi32(plane_1);

    temp0 = _mm_mullo_epi32(scale_val, plane_0);  // multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, plane_1);  // multiply by pu2_scale_matrix[0]

    temp0 = _mm_add_epi32(temp0, rnd_fact);  // Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);

    temp0 = _mm_srli_epi32(temp0,
                           u4_qbits);  // Right shift by qbits; unsigned variable,
                                       // so shift right immediate works
    temp1 = _mm_srli_epi32(temp1, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);     // Final values are 16-bits only.
    temp0 = _mm_sign_epi16(temp0, sign_reg0);  // Sign restoration

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);

    cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
    mask = _mm_movemask_epi8(cmp);
    mask0 = mask & 0xff;
    mask1 = mask >> 8;
    if(mask0)
    {
        if(mask0 == 0xff)
            nonzero_coeff_0 += 4;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            val = _mm_cvtsi128_si32(sum1);
            val = val & 0xffff;
            nonzero_coeff_0 += val;
        }
    }
    if(mask1)
    {
        if(mask1 == 0xff)
            nonzero_coeff_1 += 4;
        else
        {
            cmp1 = _mm_srli_si128(cmp, 8);
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            nonzero_coeff_1 += _mm_cvtsi128_si32(sum1);
        }
    }

    /* nonzero_coeff_0/1 actually accumulate the zero-coefficient count of each
     * plane, so the NNZ per plane is 4 minus that count */
    pu1_nnz[0] = 4 - nonzero_coeff_0;
    pu1_nnz[1] = 4 - nonzero_coeff_1;
}
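
/*
 * Illustrative scalar sketch (hypothetical helper, not part of the library):
 * the 2x2 forward hadamard that the SSE4.2 routine above computes for both
 * chroma planes at once, written out for a single plane. The four input DC
 * coefficients are assumed to be in raster order; no >> 1 normalisation is
 * applied, matching the intrinsic code.
 */
static void isvc_hadamard_2x2_scalar_sketch(const WORD16 *pi2_src, WORD32 ai4_dst[4])
{
    WORD32 c0 = pi2_src[0], c1 = pi2_src[1];
    WORD32 c2 = pi2_src[2], c3 = pi2_src[3];

    /* Same output order as the shuffled plane_0/plane_1 registers above */
    ai4_dst[0] = c0 + c1 + c2 + c3;
    ai4_dst[1] = c0 - c1 + c2 - c3;
    ai4_dst[2] = c0 + c1 - c2 - c3;
    ai4_dst[3] = c0 - c1 - c2 + c3;
}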