/******************************************************************************
 *
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/**
 *******************************************************************************
 * @file
 *  ih264_resi_trans_quant_sse42.c
 *
 * @brief
 *  Contains function definitions for the single-stage forward transform for
 *  H.264: the functions compute the residue, apply the forward transform and
 *  then quantize the result.
 *
 * @author
 *  Mohit [100664]
 *
 * @par List of Functions:
 *  - ih264_resi_trans_quant_4x4_sse42()
 *  - ih264_resi_trans_quant_chroma_4x4_sse42()
 *  - ih264_hadamard_quant_4x4_sse42()
 *  - ih264_hadamard_quant_2x2_uv_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
/* System include files */
#include <stddef.h>
#include <immintrin.h>

/* User include files */
#include "ih264_typedefs.h"
#include "ih264_defs.h"
#include "ih264_size_defs.h"
#include "ih264_macros.h"
#include "ih264_platform_macros.h"
#include "ih264_trans_macros.h"
#include "ih264_trans_data.h"
#include "ih264_structs.h"
#include "ih264_trans_quant_itrans_iquant.h"

/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward transform and quantization on a 4x4 block
 *
 * @par Description:
 *  The function accepts a source buffer and a prediction buffer and computes
 *  the residue between them. The residue is then transformed and quantized.
 *  The transform and quantization are computed in place, using the residue
 *  buffer itself.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 *  Pointer to the transformed and quantized output sub-block
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 *  Address at which the unquantized DC coefficient is stored
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred,
                                      WORD16 *pi2_out, WORD32 src_strd, WORD32 pred_strd,
                                      const UWORD16 *pu2_scale_matrix,
                                      const UWORD16 *pu2_threshold_matrix,
                                      UWORD32 u4_qbits, UWORD32 u4_round_factor,
                                      UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i sum0, sum1, sum2, cmp0, cmp1;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;

    UNUSED (pu2_threshold_matrix);

    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));     //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));            //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd]));     //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

    src_r0 = _mm_cvtepu8_epi16(src_r0);
    src_r1 = _mm_cvtepu8_epi16(src_r1);
    src_r2 = _mm_cvtepu8_epi16(src_r2);
    src_r3 = _mm_cvtepu8_epi16(src_r3);

    pred_r0 = loadu_32(&pu1_pred[0]);             //p00 p01 p02 p03 -- all 8 bits
    pred_r1 = loadu_32(&pu1_pred[pred_strd]);     //p10 p11 p12 p13 -- all 8 bits
    pred_r2 = loadu_32(&pu1_pred[2 * pred_strd]); //p20 p21 p22 p23 -- all 8 bits
    pred_r3 = loadu_32(&pu1_pred[3 * pred_strd]); //p30 p31 p32 p33 -- all 8 bits

    pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits

    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

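    /* Stash the unquantized DC coefficient; when the block belongs to a DC
       path (intra 16x16 luma or chroma), the caller can feed the collected DC
       values into the separate Hadamard quantization routines further below. */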
    tmp_dc = _mm_extract_epi16(src_r0, 0); //a0
    *pi2_alt_dc_addr = tmp_dc;

    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
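    /* Quantization. A sketch of the equivalent per-coefficient scalar
       computation (illustrative only; the SIMD code below vectorizes it):
     *
     *     sign  = (coef < 0) ? -1 : 1;
     *     level = (ABS(coef) * pu2_scale_matrix[i] + u4_round_factor) >> u4_qbits;
     *     out   = sign * level;
     */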
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
    sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact);
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits);
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

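    /* Count the zero coefficients: compare each quantized value with zero,
       reduce the resulting 0/1 lanes with horizontal adds, and derive the
       non-zero count as 16 minus that total. */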
    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz = u4_nonzero_coeff;
}

/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward transform and quantization on a 4x4 chroma
 *  block
 *
 * @par Description:
 *  The function accepts a source buffer and a prediction buffer and computes
 *  the residue between them. The residue is then transformed and quantized.
 *  The transform and quantization are computed in place, using the residue
 *  buffer itself.
 *
 * @param[in] pu1_src
 *  Pointer to source sub-block
 *
 * @param[in] pu1_pred
 *  Pointer to prediction sub-block
 *
 * @param[out] pi2_out
 *  Pointer to the transformed and quantized output sub-block
 *
 * @param[in] src_strd
 *  Source stride
 *
 * @param[in] pred_strd
 *  Prediction stride
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @param[out] pi2_alt_dc_addr
 *  Address at which the unquantized DC coefficient is stored
 *
 * @returns
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */
void ih264_resi_trans_quant_chroma_4x4_sse42(UWORD8 *pu1_src, UWORD8 *pu1_pred, WORD16 *pi2_out,
                                             WORD32 src_strd, WORD32 pred_strd,
                                             const UWORD16 *pu2_scale_matrix,
                                             const UWORD16 *pu2_threshold_matrix,
                                             UWORD32 u4_qbits, UWORD32 u4_round_factor,
                                             UWORD8 *pu1_nnz, WORD16 *pi2_alt_dc_addr)
{
    WORD32 tmp_dc, u4_zero_coeff, u4_nonzero_coeff = 0;
    WORD32 mask0, mask1;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i temp_2 = _mm_set1_epi16(2);
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i pred_r0, pred_r1, pred_r2, pred_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i zero_8x16b = _mm_setzero_si128(); // all bits reset to zero
    __m128i sign_reg0, sign_reg2;
    __m128i scalemat_r0_r1, scalemat_r2_r3;
    __m128i chroma_mask = _mm_set1_epi16(0xFF);

    UNUSED (pu2_threshold_matrix);

    scalemat_r0_r1 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix));     //b00 b01 b02 b03 b10 b11 b12 b13 -- the scaling matrix 0th,1st row
    scalemat_r2_r3 = _mm_loadu_si128((__m128i *) (pu2_scale_matrix + 8)); //b20 b21 b22 b23 b30 b31 b32 b33 -- the scaling matrix 2nd,3rd row
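    /* The chroma source and prediction buffers are assumed to hold interleaved
       Cb/Cr samples: each 8-byte row load below picks up four sample pairs, and
       masking every 16-bit lane with 0x00FF keeps only the samples of the plane
       being coded while also widening them to 16 bits. */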
    src_r0 = _mm_loadl_epi64((__m128i *) (&pu1_src[0]));            //a00 a01 a02 a03 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r1 = _mm_loadl_epi64((__m128i *) (&pu1_src[src_strd]));     //a10 a11 a12 a13 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r2 = _mm_loadl_epi64((__m128i *) (&pu1_src[2 * src_strd])); //a20 a21 a22 a23 0 0 0 0 0 0 0 0 -- all 8 bits
    src_r3 = _mm_loadl_epi64((__m128i *) (&pu1_src[3 * src_strd])); //a30 a31 a32 a33 0 0 0 0 0 0 0 0 -- all 8 bits

    src_r0 = _mm_and_si128(src_r0, chroma_mask);
    src_r1 = _mm_and_si128(src_r1, chroma_mask);
    src_r2 = _mm_and_si128(src_r2, chroma_mask);
    src_r3 = _mm_and_si128(src_r3, chroma_mask);
    // src_r0 = _mm_cvtepu8_epi16(src_r0);
    // src_r1 = _mm_cvtepu8_epi16(src_r1);
    // src_r2 = _mm_cvtepu8_epi16(src_r2);
    // src_r3 = _mm_cvtepu8_epi16(src_r3);

    pred_r0 = _mm_loadl_epi64((__m128i *) (&pu1_pred[0]));             //p00 p01 p02 p03 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r1 = _mm_loadl_epi64((__m128i *) (&pu1_pred[pred_strd]));     //p10 p11 p12 p13 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r2 = _mm_loadl_epi64((__m128i *) (&pu1_pred[2 * pred_strd])); //p20 p21 p22 p23 0 0 0 0 0 0 0 0 -- all 8 bits
    pred_r3 = _mm_loadl_epi64((__m128i *) (&pu1_pred[3 * pred_strd])); //p30 p31 p32 p33 0 0 0 0 0 0 0 0 -- all 8 bits

    pred_r0 = _mm_and_si128(pred_r0, chroma_mask);
    pred_r1 = _mm_and_si128(pred_r1, chroma_mask);
    pred_r2 = _mm_and_si128(pred_r2, chroma_mask);
    pred_r3 = _mm_and_si128(pred_r3, chroma_mask);
    // pred_r0 = _mm_cvtepu8_epi16(pred_r0); //p00 p01 p02 p03 -- all 16 bits
    // pred_r1 = _mm_cvtepu8_epi16(pred_r1); //p10 p11 p12 p13 -- all 16 bits
    // pred_r2 = _mm_cvtepu8_epi16(pred_r2); //p20 p21 p22 p23 -- all 16 bits
    // pred_r3 = _mm_cvtepu8_epi16(pred_r3); //p30 p31 p32 p33 -- all 16 bits

    src_r0 = _mm_sub_epi16(src_r0, pred_r0);
    src_r1 = _mm_sub_epi16(src_r1, pred_r1);
    src_r2 = _mm_sub_epi16(src_r2, pred_r2);
    src_r3 = _mm_sub_epi16(src_r3, pred_r3);

    /* Perform Forward transform */
    /*-------------------------------------------------------------*/
    /* DCT [ Horizontal transformation ]                           */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 b0 a1 b1 a2 b2 a3 b3
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //c0 d0 c1 d1 c2 d2 c3 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 b0 c0 d0 a1 b1 c1 d1
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //a2 b2 c2 d2 a3 b3 c3 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //a3 b3 c3 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi16(src_r0, src_r1); //a0 a1 b0 b1 c0 c1 d0 d1
    temp2 = _mm_unpacklo_epi16(src_r2, src_r3); //a2 a3 b2 b3 c2 c3 d2 d3
    temp1 = _mm_unpacklo_epi32(temp0, temp2);   //a0 a1 a2 a3 b0 b1 b2 b3
    temp3 = _mm_unpackhi_epi32(temp0, temp2);   //c0 c1 c2 c3 d0 d1 d2 d3

    src_r0 = _mm_unpacklo_epi64(temp1, zero_8x16b); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp1, zero_8x16b); //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp3, zero_8x16b); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp3, zero_8x16b); //d0 d1 d2 d3

    /*----------------------------------------------------------*/
    /* x0 = z0 + z3                                             */
    temp0 = _mm_add_epi16(src_r0, src_r3);
    /* x1 = z1 + z2                                             */
    temp1 = _mm_add_epi16(src_r1, src_r2);
    /* x2 = z1 - z2                                             */
    temp2 = _mm_sub_epi16(src_r1, src_r2);
    /* x3 = z0 - z3                                             */
    temp3 = _mm_sub_epi16(src_r0, src_r3);

    /* z0 = x0 + x1                                             */
    src_r0 = _mm_add_epi16(temp0, temp1);
    /* z1 = (x3 << 1) + x2                                      */
    src_r1 = _mm_slli_epi16(temp3, 1); //(x3<<1)
    src_r1 = _mm_add_epi16(src_r1, temp2);
    /* z2 = x0 - x1                                             */
    src_r2 = _mm_sub_epi16(temp0, temp1);
    /* z3 = x3 - (x2 << 1)                                      */
    src_r3 = _mm_slli_epi16(temp2, 1); //(x2<<1)
    src_r3 = _mm_sub_epi16(temp3, src_r3);

    tmp_dc = _mm_extract_epi16(src_r0, 0); //a0
    *pi2_alt_dc_addr = tmp_dc;

    src_r0 = _mm_unpacklo_epi64(src_r0, src_r1); //a0 a1 a2 a3 b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(src_r2, src_r3); //c0 c1 c2 c3 d0 d1 d2 d3
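    /* Quantization follows the same sequence as the luma path above:
       capture signs, take absolute values, scale, add the round factor,
       shift right by u4_qbits and restore the signs. */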
    sign_reg0 = _mm_cmpgt_epi16(zero_8x16b, src_r0);
    sign_reg2 = _mm_cmpgt_epi16(zero_8x16b, src_r2);

    sign_reg0 = _mm_mullo_epi16(temp_2, sign_reg0);
    sign_reg2 = _mm_mullo_epi16(temp_2, sign_reg2);

    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi16(src_r0);
    src_r2 = _mm_abs_epi16(src_r2);

    src_r1 = _mm_srli_si128(src_r0, 8);
    src_r0 = _mm_cvtepu16_epi32(src_r0);
    src_r1 = _mm_cvtepu16_epi32(src_r1);
    src_r3 = _mm_srli_si128(src_r2, 8);
    src_r2 = _mm_cvtepu16_epi32(src_r2);
    src_r3 = _mm_cvtepu16_epi32(src_r3);

    temp0 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    scalemat_r0_r1 = _mm_srli_si128(scalemat_r0_r1, 8);
    temp2 = _mm_cvtepu16_epi32(scalemat_r2_r3);
    scalemat_r2_r3 = _mm_srli_si128(scalemat_r2_r3, 8);
    temp1 = _mm_cvtepu16_epi32(scalemat_r0_r1);
    temp3 = _mm_cvtepu16_epi32(scalemat_r2_r3);

    temp0 = _mm_mullo_epi32(temp0, src_r0);
    temp1 = _mm_mullo_epi32(temp1, src_r1);
    temp2 = _mm_mullo_epi32(temp2, src_r2);
    temp3 = _mm_mullo_epi32(temp3, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact);
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits);
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0);
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    //temp0 = _mm_insert_epi16(temp0, tmp_dc, 0);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    *pu1_nnz = u4_nonzero_coeff;
}


/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward Hadamard transform and quantization on a
 *  4x4 block of DC coefficients
 *
 * @par Description:
 *  The function accepts a 4x4 block of DC coefficients, applies the 4x4
 *  Hadamard transform to it, quantizes the result and writes it to the
 *  destination buffer.
 *
 * @param[in] pi2_src
 *  Pointer to the 4x4 block of DC coefficients
 *
 * @param[out] pi2_dst
 *  Pointer to the transformed and quantized output block
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Total non-zero coefficients in the current sub-block
 *
 * @returns
 *
 * @remarks
 *  None
 *
 */

void ih264_hadamard_quant_4x4_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                    const UWORD16 *pu2_scale_matrix,
                                    const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                                    UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 u4_zero_coeff, u4_nonzero_coeff = 0;
    __m128i cmp0, cmp1, sum0, sum1, sum2;
    WORD32 mask0, mask1;
    __m128i src_r0_r1, src_r2_r3, sign_reg;
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i temp0, temp1, temp2, temp3;
    __m128i sign_reg0, sign_reg1, sign_reg2, sign_reg3;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);

    UNUSED (pu2_threshold_matrix);

    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src));     //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
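    /* Sign-extend the 16-bit DC coefficients to 32 bits (unpack each value with
       its sign mask) so the Hadamard butterflies below run at 32-bit precision. */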
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg); //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg); //b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg); //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg); //d0 d1 d2 d3

    /* Perform forward DC transform */
    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Horizontal transformation ]          */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 b0 a1 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //c0 d0 c1 d1
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //a2 b2 a3 b3
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 d2 c3 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);  //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);  //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);  //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);  //a3 b3 c3 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /*-------------------------------------------------------------*/
    /* Forward DC transform [ Vertical transformation ]            */
    /*-------------------------------------------------------------*/
    // Matrix transpose
    /*
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
     */
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1); //a0 a1 b0 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3); //a2 a3 b2 b3
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1); //c0 c1 d0 d1
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3); //c2 c3 d2 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);  //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);  //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);  //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);  //d0 d1 d2 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

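    /* Halve the Hadamard output (arithmetic shift right by 1) before
       quantization. */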
    src_r0 = _mm_srai_epi32(src_r0, 1);
    src_r1 = _mm_srai_epi32(src_r1, 1);
    src_r2 = _mm_srai_epi32(src_r2, 1);
    src_r3 = _mm_srai_epi32(src_r3, 1);

    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, src_r0); //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, src_r1);
    sign_reg2 = _mm_cmpgt_epi32(zero_8x16b, src_r2);
    sign_reg3 = _mm_cmpgt_epi32(zero_8x16b, src_r3);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_packs_epi32(sign_reg2, sign_reg3);

    sign_reg0 = _mm_slli_epi16(sign_reg0, 1); //Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg2 = _mm_slli_epi16(sign_reg2, 1);

    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0); //Sign = -1 or 1 depending on <0 or >0 respectively
    sign_reg2 = _mm_add_epi16(temp_1, sign_reg2);

    src_r0 = _mm_abs_epi32(src_r0); //Absolute values
    src_r1 = _mm_abs_epi32(src_r1);
    src_r2 = _mm_abs_epi32(src_r2);
    src_r3 = _mm_abs_epi32(src_r3);

    temp0 = _mm_mullo_epi32(scale_val, src_r0); //multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, src_r1);
    temp2 = _mm_mullo_epi32(scale_val, src_r2);
    temp3 = _mm_mullo_epi32(scale_val, src_r3);

    temp0 = _mm_add_epi32(temp0, rnd_fact); //Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);
    temp2 = _mm_add_epi32(temp2, rnd_fact);
    temp3 = _mm_add_epi32(temp3, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits); //Right shift by qbits; values are non-negative, so a logical shift right works
    temp1 = _mm_srli_epi32(temp1, u4_qbits);
    temp2 = _mm_srli_epi32(temp2, u4_qbits);
    temp3 = _mm_srli_epi32(temp3, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1); //Final values are 16-bits only.
    temp2 = _mm_packs_epi32(temp2, temp3);

    temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration
    temp2 = _mm_sign_epi16(temp2, sign_reg2);

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);
    _mm_storeu_si128((__m128i *) (&pi2_dst[8]), temp2);

    cmp0 = _mm_cmpeq_epi16(temp0, zero_8x16b);
    cmp1 = _mm_cmpeq_epi16(temp2, zero_8x16b);

    mask0 = _mm_movemask_epi8(cmp0);
    mask1 = _mm_movemask_epi8(cmp1);
    u4_zero_coeff = 0;
    if(mask0)
    {
        if(mask0 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp0);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }
    if(mask1)
    {
        if(mask1 == 0xffff)
            u4_zero_coeff += 8;
        else
        {
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            sum2 = _mm_hadd_epi16(sum1, zero_8x16b);
            u4_zero_coeff += _mm_cvtsi128_si32(sum2);
        }
    }

    /* Return total nonzero coefficients in the current sub block */
    u4_nonzero_coeff = 16 - u4_zero_coeff;
    pu1_nnz[0] = u4_nonzero_coeff;
}


/**
 *******************************************************************************
 *
 * @brief
 *  This function performs forward Hadamard transform and quantization on a
 *  2x2 block of DC coefficients for both the U and V planes
 *
 * @par Description:
 *  The function accepts the 2x2 chroma DC coefficients of both planes,
 *  applies the 2x2 Hadamard transform to each plane and quantizes the
 *  results.
 *
 * @param[in] pi2_src
 *  Pointer to the DC coefficients (four values of the first plane followed by
 *  four values of the second)
 *
 * @param[out] pi2_dst
 *  Pointer to the transformed and quantized output block
 *
 * @param[in] pu2_scale_matrix
 *  Pointer to Forward Quant Scale Matrix
 *
 * @param[in] pu2_threshold_matrix
 *  Pointer to Forward Quant Threshold Matrix
 *
 * @param[in] u4_qbits
 *  QP_BITS_h264_4x4 + floor(QP/6)
 *
 * @param[in] u4_round_factor
 *  Quantization Round factor
 *
 * @param[out] pu1_nnz
 *  Non-zero coefficient count per chroma plane
 *
 * @returns
 *
 * @remarks
 *  NNZ for the two chroma DC blocks is written to pu1_nnz[0] and pu1_nnz[1]
 *
 */

void ih264_hadamard_quant_2x2_uv_sse42(WORD16 *pi2_src, WORD16 *pi2_dst,
                                       const UWORD16 *pu2_scale_matrix,
                                       const UWORD16 *pu2_threshold_matrix, UWORD32 u4_qbits,
                                       UWORD32 u4_round_factor, UWORD8 *pu1_nnz)
{
    WORD32 val, nonzero_coeff_0 = 0, nonzero_coeff_1 = 0;
    __m128i cmp, cmp0, cmp1;
    __m128i sum0, sum1;
    WORD32 mask, mask0, mask1;
    __m128i src, plane_0, plane_1, temp0, temp1, sign_reg;
    __m128i zero_8x16b = _mm_setzero_si128();
    __m128i scale_val = _mm_set1_epi32(pu2_scale_matrix[0]);
    __m128i sign_reg0, sign_reg1;
    __m128i temp_1 = _mm_set1_epi16(1);
    __m128i rnd_fact = _mm_set1_epi32(u4_round_factor);

    UNUSED (pu2_threshold_matrix);

    src = _mm_loadu_si128((__m128i *) pi2_src);  //a0 a1 a2 a3 b0 b1 b2 b3
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src);
    plane_0 = _mm_unpacklo_epi16(src, sign_reg); //a0 a1 a2 a3 -- 32 bits
    plane_1 = _mm_unpackhi_epi16(src, sign_reg); //b0 b1 b2 b3 -- 32 bits
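    /* plane_0 now holds the four DC values of the first chroma plane and
       plane_1 those of the second, each sign-extended to 32 bits; the
       horizontal add/sub pairs below evaluate the 2x2 Hadamard for both
       planes at once. */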

    temp0 = _mm_hadd_epi32(plane_0, plane_1); //a0+a1 a2+a3 b0+b1 b2+b3
    temp1 = _mm_hsub_epi32(plane_0, plane_1); //a0-a1 a2-a3 b0-b1 b2-b3

    plane_0 = _mm_hadd_epi32(temp0, temp1); //a0+a1+a2+a3 b0+b1+b2+b3 a0-a1+a2-a3 b0-b1+b2-b3
    plane_1 = _mm_hsub_epi32(temp0, temp1); //a0+a1-a2-a3 b0+b1-b2-b3 a0-a1-a2+a3 b0-b1-b2+b3

    temp0 = _mm_unpacklo_epi32(plane_0, plane_1); //a0+a1+a2+a3 a0+a1-a2-a3 b0+b1+b2+b3 b0+b1-b2-b3
    temp1 = _mm_unpackhi_epi32(plane_0, plane_1); //a0-a1+a2-a3 a0-a1-a2+a3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_unpacklo_epi64(temp0, temp1); //a0+a1+a2+a3 a0+a1-a2-a3 a0-a1+a2-a3 a0-a1-a2+a3
    plane_1 = _mm_unpackhi_epi64(temp0, temp1); //b0+b1+b2+b3 b0+b1-b2-b3 b0-b1+b2-b3 b0-b1-b2+b3

    plane_0 = _mm_shuffle_epi32(plane_0, 0xd8); //a0+a1+a2+a3 a0-a1+a2-a3 a0+a1-a2-a3 a0-a1-a2+a3
    plane_1 = _mm_shuffle_epi32(plane_1, 0xd8); //b0+b1+b2+b3 b0-b1+b2-b3 b0+b1-b2-b3 b0-b1-b2+b3

    // Quantization
    sign_reg0 = _mm_cmpgt_epi32(zero_8x16b, plane_0); //Find sign of each value for later restoration
    sign_reg1 = _mm_cmpgt_epi32(zero_8x16b, plane_1);

    sign_reg0 = _mm_packs_epi32(sign_reg0, sign_reg1); //Sign = -1 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_slli_epi16(sign_reg0, 1);          //Sign = -2 or 0 depending on <0 or >0 respectively
    sign_reg0 = _mm_add_epi16(temp_1, sign_reg0);      //Sign = -1 or 1 depending on <0 or >0 respectively

    plane_0 = _mm_abs_epi32(plane_0); //Absolute values
    plane_1 = _mm_abs_epi32(plane_1);

    temp0 = _mm_mullo_epi32(scale_val, plane_0); //multiply by pu2_scale_matrix[0]
    temp1 = _mm_mullo_epi32(scale_val, plane_1); //multiply by pu2_scale_matrix[0]

    temp0 = _mm_add_epi32(temp0, rnd_fact); //Add round factor
    temp1 = _mm_add_epi32(temp1, rnd_fact);

    temp0 = _mm_srli_epi32(temp0, u4_qbits); //Right shift by qbits; values are non-negative, so a logical shift right works
    temp1 = _mm_srli_epi32(temp1, u4_qbits);

    temp0 = _mm_packs_epi32(temp0, temp1);    //Final values are 16-bits only.
    temp0 = _mm_sign_epi16(temp0, sign_reg0); //Sign restoration

    _mm_storeu_si128((__m128i *) (&pi2_dst[0]), temp0);

    cmp = _mm_cmpeq_epi16(temp0, zero_8x16b);
    mask = _mm_movemask_epi8(cmp);
    mask0 = mask & 0xff;
    mask1 = mask >> 8;
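    /* The low and high bytes of the movemask correspond to the first and second
       plane respectively. Despite their names, nonzero_coeff_0/1 accumulate the
       number of ZERO coefficients per plane; the per-plane NNZ written out at
       the end is 4 minus that count. */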
    if(mask0)
    {
        if(mask0 == 0xff)
            nonzero_coeff_0 += 4;
        else
        {
            cmp0 = _mm_and_si128(temp_1, cmp);
            sum0 = _mm_hadd_epi16(cmp0, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            val = _mm_cvtsi128_si32(sum1);
            val = val & 0xffff;
            nonzero_coeff_0 += val;
        }
    }
    if(mask1)
    {
        if(mask1 == 0xff)
            nonzero_coeff_1 += 4;
        else
        {
            cmp1 = _mm_srli_si128(cmp, 8);
            cmp1 = _mm_and_si128(temp_1, cmp1);
            sum0 = _mm_hadd_epi16(cmp1, zero_8x16b);
            sum1 = _mm_hadd_epi16(sum0, zero_8x16b);
            nonzero_coeff_1 += _mm_cvtsi128_si32(sum1);
        }
    }

    pu1_nnz[0] = 4 - nonzero_coeff_0;
    pu1_nnz[1] = 4 - nonzero_coeff_1;
}