/*
 *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif
#include <immintrin.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_dsp/x86/quantize_sse2.h"
#include "vpx_dsp/x86/quantize_ssse3.h"

void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block, const int16_t *zbin_ptr,
                        const int16_t *round_ptr, const int16_t *quant_ptr,
                        const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                        uint16_t *eob_ptr, const int16_t *scan,
                        const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  const __m256i big_zero = _mm256_setzero_si256();
  int index;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i eob = zero, eob0;

  (void)scan;
  (void)skip_block;
  assert(!skip_block);

  *eob_ptr = 0;

  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
                dequant_ptr, &dequant, quant_shift_ptr, &shift);
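  // load_b_values() comes from quantize_sse2.h; as I read the helper, it
  // loads the five quantizer vectors and also subtracts 1 from zbin so the
  // strict _mm_cmpgt_epi16() below matches the >= zbin test of the C code.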

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
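  // The quantizer tables keep the DC value in lane 0 and the AC value in the
  // remaining lanes, so the unpack below broadcasts the upper half to leave
  // AC in every lane for all later comparisons.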
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
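  // _mm_test_all_zeros() maps to the SSE4.1 PTEST instruction: it returns 1
  // when no bit survives the AND, i.e. no coefficient in this group exceeded
  // its zbin threshold and the whole group quantizes to zero.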
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
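    // With CONFIG_VP9_HIGHBITDEPTH, tran_low_t is 32 bits rather than 16, so
    // the two extra stores above are needed to clear all 16 coefficients.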

    if (n_coeffs == 16) return;

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
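    // calculate_qcoeff() (quantize_sse2.h) stays in 16-bit lanes; my reading
    // of the helper is that it evaluates the scalar reference formula
    //   tmp = abs_coeff + round;
    //   qcoeff = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
    // using _mm_mulhi_epi16() for both of the >> 16 multiplies.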
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
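    // _mm_sign_epi16(a, b) negates each lane of a where the matching lane of
    // b is negative and zeroes it where b is zero, restoring the sign that
    // _mm_abs_epi16() stripped before quantization.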

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);

    eob =
        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
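    // A scalar sketch of what scan_for_eob() appears to compute:
    //   for (i = 0; i < 16; ++i)
    //     if (qcoeff[i] != 0) eob = VPXMAX(eob, iscan[i] + 1);
    // with the running maximum kept per 16-bit lane until the final reduce.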
  }

  // AC only loop.
  for (index = 16; index < n_coeffs; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    calculate_qcoeff(&qcoeff0, round, quant, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);

    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
                        zero);
    eob = _mm_max_epi16(eob, eob0);
  }

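  // accumulate_eob() reduces the per-lane maxima to a single scalar; as far
  // as I can tell it is a horizontal max over the eight 16-bit lanes.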
  *eob_ptr = accumulate_eob(eob);
}

void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block, const int16_t *zbin_ptr,
                              const int16_t *round_ptr,
                              const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m256i big_zero = _mm256_setzero_si256();
  int index;

  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i all_zero;
  __m128i eob = zero, eob0;

  (void)scan;
  (void)n_coeffs;
  (void)skip_block;
  assert(!skip_block);

  // Setup global values.
  // The 32x32 path halves zbin and round.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  // Shift with rounding.
  zbin = _mm_add_epi16(zbin, one);
  zbin = _mm_srli_epi16(zbin, 1);
  // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
  // it is a strict "greater" comparison.
  zbin = _mm_sub_epi16(zbin, one);
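  // Worked example (illustrative value, not from the source): for zbin = 53,
  // ((53 + 1) >> 1) - 1 = 26, so the strict abs(coeff) > 26 test below is
  // equivalent to abs(coeff) >= 27 == ROUND_POWER_OF_TWO(53, 1).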

  round = _mm_load_si128((const __m128i *)round_ptr);
  round = _mm_add_epi16(round, one);
  round = _mm_srli_epi16(round, 1);

  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
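  // Doubling quant_shift below compensates for the extra bit of precision in
  // the 32x32 path: the C reference shifts the final product right by 15
  // rather than 16, and (x * (shift << 1)) >> 16 == (x * shift) >> 15.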
  shift = _mm_slli_epi16(shift, 1);

  // Do DC and first 15 AC.
  coeff0 = load_tran_low(coeff_ptr);
  coeff1 = load_tran_low(coeff_ptr + 8);

  qcoeff0 = _mm_abs_epi16(coeff0);
  qcoeff1 = _mm_abs_epi16(coeff1);

  cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
  cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_test_all_zeros(all_zero, all_zero)) {
    _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
    _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
    _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    calculate_qcoeff(&qcoeff0, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    // Reinsert signs.
    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    // Mask out zbin threshold coeffs.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr);
    store_tran_low(qcoeff1, qcoeff_ptr + 8);

    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
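    // calculate_dqcoeff_and_store_32x32() (quantize_ssse3.h) mirrors the C
    // reference's dqcoeff = qcoeff * dequant / 2; as I read it, the `zero`
    // argument is used while widening the 16-bit products to 32 bits so the
    // halving and the sign restoration stay exact.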

    eob =
        scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
  }

  // AC only loop.
  for (index = 16; index < 32 * 32; index += 16) {
    coeff0 = load_tran_low(coeff_ptr + index);
    coeff1 = load_tran_low(coeff_ptr + index + 8);

    qcoeff0 = _mm_abs_epi16(coeff0);
    qcoeff1 = _mm_abs_epi16(coeff1);

    cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_test_all_zeros(all_zero, all_zero)) {
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
#if CONFIG_VP9_HIGHBITDEPTH
      _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
      _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
      continue;
    }

    calculate_qcoeff(&qcoeff0, round, quant, shift);
    calculate_qcoeff(&qcoeff1, round, quant, shift);

    qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
    qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);

    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    store_tran_low(qcoeff0, qcoeff_ptr + index);
    store_tran_low(qcoeff1, qcoeff_ptr + index + 8);

    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, zero,
                                      dqcoeff_ptr + index);
    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
                                      dqcoeff_ptr + index + 8);

    eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
                        zero);
    eob = _mm_max_epi16(eob, eob0);
  }

  *eob_ptr = accumulate_eob(eob);
}