1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <emmintrin.h>
12 #include <xmmintrin.h>
13
14 #include "./vp9_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
18
/*
 * SSE2 "fp" quantizer: quantizes and dequantizes a block of transform
 * coefficients 16 at a time and reports the end-of-block (eob) count.
 *
 * coeff_ptr    Input coefficients.
 * n_coeffs     Number of coefficients. Both the quantizing path and the
 *              skip path handle the first 16 entries unconditionally and
 *              then advance in steps of 16, so this is assumed to be a
 *              positive multiple of 16 -- TODO confirm against callers.
 * skip_block   When non-zero no quantization is done: qcoeff_ptr and
 *              dqcoeff_ptr are zero-filled and *eob_ptr is set to 0.
 * round_ptr, quant_ptr, dequant_ptr
 *              Each is read once as a vector of eight int16_t using an
 *              aligned 128-bit load. The full vector is applied to
 *              coefficients 0..7; its upper four lanes are then duplicated
 *              into both halves and applied to all remaining coefficients.
 * qcoeff_ptr   Output: quantized coefficients.
 * dqcoeff_ptr  Output: quantized coefficients multiplied by dequant
 *              (_mm_mullo_epi16, i.e. the low 16 bits of each product).
 * eob_ptr      Output: the largest (iscan + 1) over all positions whose
 *              dequantized value is non-zero, or 0 if there is none.
 * scan_ptr     Unused.
 * iscan_ptr    Per-coefficient scan positions, read with aligned 128-bit
 *              loads.
 */
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *round_ptr,
                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan_ptr,
                          const int16_t *iscan_ptr) {
  __m128i zero;
  __m128i thr;
  // _mm_movemask_epi8() returns a 16-bit mask in an int. Keep the full int:
  // narrowing a mask with bit 15 set to int16_t is implementation-defined.
  int nzflag;

  (void)scan_ptr;

  // Point every array at its end and walk a negative index up towards zero,
  // so a single counter serves as both the offset and the loop condition.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract: the arithmetic shift gives an all-ones
        // mask for negative lanes, and (x ^ mask) - mask is then |x|.
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Saturating add of the rounding term, then keep the high 16 bits of
        // the product with quant. After the first vector, round and quant
        // are replaced by their upper halves broadcast to both halves; those
        // values are reused for every later coefficient.
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        // Dequantize; coeff0/coeff1 now hold the dequantized values, which
        // the eob scan below tests against zero. dequant gets the same
        // upper-half broadcast as round and quant.
        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts: the non-zero mask is
        // -1 in each selected lane, so subtracting it adds 1 there.
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        // Lanes with a zero result contribute 0 to the running maximum.
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // Early-out threshold for the loop below: half of the (broadcast)
    // dequant value per lane.
    thr = _mm_srai_epi16(dequant, 1);

    // AC only loop
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Non-zero iff at least one of the 16 magnitudes exceeds thr.
        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

        if (nzflag) {
          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

          // Reinsert signs
          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
        } else {
          // Nothing above the threshold: write zeros for all 16 outputs and
          // skip the multiply path entirely.
          store_zero_tran_low(qcoeff_ptr + n_coeffs);
          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
        }
      }

      // The eob scan only runs when this group went through the multiply
      // path; a zeroed group leaves eob unchanged.
      if (nzflag) {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB: horizontal maximum of the eight 16-bit lanes.
    {
      __m128i eob_shuffled;
      // Fold the upper 64 bits onto the lower 64 bits.
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      // Fold words 2..3 onto words 0..1.
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      // Fold word 0 onto word 1, which then holds the overall maximum.
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    // Skip path: zero both outputs. The do/while writes the first 16
    // entries before testing the counter.
    do {
      store_zero_tran_low(qcoeff_ptr + n_coeffs);
      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
212