/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
#include "vpx_dsp/x86/quantize_ssse3.h"
void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block, const int16_t *zbin_ptr,
                        const int16_t *round_ptr, const int16_t *quant_ptr,
                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan,
                        const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i big_zero = _mm256_setzero_si256();
  int index;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i eob = zero, eob0;

  (void)scan;
  (void)skip_block;
  assert(!skip_block);

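  // Zero the eob up front: the n_coeffs == 16 early return below leaves it
  // untouched when the whole block quantizes to zero.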
  *eob_ptr = 0;

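  // Load the per-segment values. Lane 0 of each vector holds the DC
  // parameter and the remaining lanes hold the AC parameter; zbin is also
  // biased down by 1 (in quantize_sse2.h) so the strict cmpgt below acts as
  // a "greater or equal" test.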
  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
                dequant_ptr, &dequant, quant_shift_ptr, &shift);

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

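  // Work on magnitudes; the signs are reapplied with _mm_sign_epi16() after
  // quantization.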
  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

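  // If no coefficient in the block exceeds its zbin threshold, the whole
  // block quantizes to zero: store zeros and skip the arithmetic.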
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
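    // With CONFIG_VP9_HIGHBITDEPTH, tran_low_t is 32 bits, so 16
    // coefficients occupy two 256-bit stores instead of one.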
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH

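    // A 4x4 block is fully handled above; *eob_ptr is already 0.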
    if (n_coeffs == 16) return;

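    // Broadcast the AC parameters into every lane for the AC-only loop.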
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
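    // calculate_qcoeff() (quantize_sse2.h) computes, per lane and roughly,
    // qcoeff = (((abs + round) * quant >> 16) + abs + round) * shift >> 16.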
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);

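    // Track the highest iscan position holding a nonzero coefficient.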
    eob =
        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
  }

  // AC only loop.
  for (index = 16; index < n_coeffs; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    calculate_qcoeff(&qcoeff0, round, quant, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);

    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
                        zero);
    eob = _mm_max_epi16(eob, eob0);
  }

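  // Reduce the per-lane maxima to the final scalar end-of-block index.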
  *eob_ptr = accumulate_eob(eob);
}

void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,
                              const int16_t *round_ptr,
                              const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m256i big_zero = _mm256_setzero_si256();
  int index;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i eob = zero, eob0;

  (void)scan;
  (void)n_coeffs;
  (void)skip_block;
  assert(!skip_block);

  // Set up global values.
  // The 32x32 quantizer halves zbin and round relative to the other block
  // sizes.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  // Shift right by one with rounding.
  zbin = _mm_add_epi16(zbin, one);
  zbin = _mm_srli_epi16(zbin, 1);
  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
  // it is a strict "greater" comparison.
  zbin = _mm_sub_epi16(zbin, one);

  round = _mm_load_si128((const __m128i *)round_ptr);
  round = _mm_add_epi16(round, one);
  round = _mm_srli_epi16(round, 1);

  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
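  // Doubling quant_shift turns the multiply-high's >> 16 into the >> 15 the
  // 32x32 quantizer effectively uses.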
  shift = _mm_slli_epi16(shift, 1);

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs.
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    // Mask out zbin threshold coeffs.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

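    // The 32x32 path halves the dequantized value, dqcoeff =
    // (qcoeff * dequant) / 2, widening to 32 bits in
    // calculate_dqcoeff_and_store_32x32() (quantize_ssse3.h).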
    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);

    eob =
        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
  }

  // AC only loop.
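  // A 32x32 block always has 32 * 32 = 1024 coefficients, which is why
  // n_coeffs is unused above.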
  for (index = 16; index < 32 * 32; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    calculate_qcoeff(&qcoeff0, round, quant, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
                                      dqcoeff_ptr + index);
    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
                                      dqcoeff_ptr + index + 8);

    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
                        zero);
    eob = _mm_max_epi16(eob, eob0);
  }

  *eob_ptr = accumulate_eob(eob);
}