1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <assert.h>
13
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17
// Dequantizes eight coefficients: multiplies each lane of |qcoeff| by the
// matching lane of |dequant| and writes the eight results to |dqcoeff|.
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                               const int16x8_t dequant,
                                               tran_low_t *dqcoeff) {
  // Widening 16x16 -> 32 bit multiplies, one per half of the input vectors.
  const int32x4_t product_lo =
      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  const int32x4_t product_hi =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

#if CONFIG_VP9_HIGHBITDEPTH
  // Store the full 32-bit products.
  vst1q_s32(dqcoeff, product_lo);
  vst1q_s32(dqcoeff + 4, product_hi);
#else
  {
    // Keep only the low 16 bits of every product before storing.
    const int16x4_t narrow_lo = vmovn_s32(product_lo);
    const int16x4_t narrow_hi = vmovn_s32(product_hi);
    vst1q_s16(dqcoeff, vcombine_s16(narrow_lo, narrow_hi));
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
33
// Quantizes the eight coefficients read from |coeff_ptr| with the supplied
// per-lane parameters. The quantized values are stored to |qcoeff_ptr| and
// their dequantized counterparts to |dqcoeff_ptr|. The quantized vector is
// also returned so that the caller can track the end-of-block position.
static INLINE int16x8_t quantize_b_neon_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16x8_t v_zbin, const int16x8_t v_round,
    const int16x8_t v_quant, const int16x8_t v_quant_shift,
    const int16x8_t v_dequant) {
  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
  // All ones in negative lanes, all zeros elsewhere.
  const int16x8_t sign_mask = vshrq_n_s16(coeff, 15);
  const int16x8_t magnitude = vabsq_s16(coeff);

  // Lanes whose magnitude is below zbin are zeroed at the end.
  const int16x8_t keep_mask =
      vreinterpretq_s16_u16(vcgeq_s16(magnitude, v_zbin));

  // Saturating add so that large magnitudes cannot wrap around.
  const int16x8_t rounded = vqaddq_s16(magnitude, v_round);

  // (rounded * quant * 2) >> 16 >> 1 == (rounded * quant) >> 16
  const int16x8_t scaled = vshrq_n_s16(vqdmulhq_s16(rounded, v_quant), 1);
  const int16x8_t summed = vaddq_s16(scaled, rounded);

  // (summed * quant_shift * 2) >> 16 >> 1 == (summed * quant_shift) >> 16
  const int16x8_t shifted =
      vshrq_n_s16(vqdmulhq_s16(summed, v_quant_shift), 1);

  // Restore the sign: (x ^ mask) - mask negates x wherever mask is -1.
  const int16x8_t with_sign =
      vsubq_s16(veorq_s16(shifted, sign_mask), sign_mask);

  const int16x8_t qcoeff = vandq_s16(with_sign, keep_mask);

  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
  calculate_dqcoeff_and_store(qcoeff, v_dequant, dqcoeff_ptr);

  return qcoeff;
}

// Quantizes |n_coeffs| coefficients from |coeff_ptr| eight at a time, writing
// the quantized values to |qcoeff_ptr| and the dequantized values to
// |dqcoeff_ptr|. |*eob_ptr| receives the largest (iscan + 1) among the
// coefficients that quantized to a non-zero value, or 0 if there are none.
//
// The first group of eight uses the first eight entries of each parameter
// table; every later group uses entry [1] of each table in all lanes. At least
// two groups are always processed. |scan| is unused and |skip_block| must be 0.
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t *zbin_ptr,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan) {
  const int16x8_t v_one = vdupq_n_s16(1);
  const int16x8_t v_all_ones = vdupq_n_s16(-1);
  uint16x8_t v_eob;
  intptr_t offset = 8;
  (void)scan;
  (void)skip_block;
  assert(!skip_block);

  // First eight values, which include the DC component.
  {
    // Only the first element of each vector is DC.
    const int16x8_t zbin_dc = vld1q_s16(zbin_ptr);
    const int16x8_t round_dc = vld1q_s16(round_ptr);
    const int16x8_t quant_dc = vld1q_s16(quant_ptr);
    const int16x8_t quant_shift_dc = vld1q_s16(quant_shift_ptr);
    const int16x8_t dequant_dc = vld1q_s16(dequant_ptr);
    // Scan positions are biased by one because the eob does not index from 0.
    const uint16x8_t v_pos =
        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), v_one));

    const int16x8_t qcoeff =
        quantize_b_neon_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin_dc, round_dc,
                          quant_dc, quant_shift_dc, dequant_dc);

    // vtst yields all ones for non-zero lanes; masking with it keeps the
    // biased scan position of exactly those lanes.
    v_eob = vandq_u16(vtstq_s16(qcoeff, v_all_ones), v_pos);
  }

  // Remaining groups: every lane uses the second (AC) table entry.
  {
    const int16x8_t zbin_ac = vdupq_n_s16(zbin_ptr[1]);
    const int16x8_t round_ac = vdupq_n_s16(round_ptr[1]);
    const int16x8_t quant_ac = vdupq_n_s16(quant_ptr[1]);
    const int16x8_t quant_shift_ac = vdupq_n_s16(quant_shift_ptr[1]);
    const int16x8_t dequant_ac = vdupq_n_s16(dequant_ptr[1]);

    do {
      // Scan positions are biased by one because the eob is not an index.
      const uint16x8_t v_pos =
          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan + offset), v_one));

      const int16x8_t qcoeff = quantize_b_neon_8(
          coeff_ptr + offset, qcoeff_ptr + offset, dqcoeff_ptr + offset,
          zbin_ac, round_ac, quant_ac, quant_shift_ac, dequant_ac);

      // Keep the running per-lane maximum of the non-zero scan positions.
      v_eob = vmaxq_u16(v_eob,
                        vandq_u16(vtstq_s16(qcoeff, v_all_ones), v_pos));

      offset += 8;
    } while (offset < n_coeffs);
  }

  // Horizontal maximum across the eight lanes gives the final eob.
#ifdef __aarch64__
  *eob_ptr = vmaxvq_u16(v_eob);
#else
  {
    const uint16x4_t max_of_halves =
        vmax_u16(vget_low_u16(v_eob), vget_high_u16(v_eob));
    const uint16x4_t max_of_pairs = vpmax_u16(max_of_halves, max_of_halves);
    const uint16x4_t max_of_all = vpmax_u16(max_of_pairs, max_of_pairs);
    vst1_lane_u16(eob_ptr, max_of_all, 0);
  }
#endif  // __aarch64__
}
162
// Returns 1 in every lane of |a| that is negative and 0 in every other lane.
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
  // A logical (unsigned) right shift by 31 moves the sign bit down to bit 0.
  const uint32x4_t raw_bits = vreinterpretq_u32_s32(a);
  const uint32x4_t sign_bits = vshrq_n_u32(raw_bits, 31);
  return vreinterpretq_s32_u32(sign_bits);
}
166
// Dequantizes eight coefficients for the 32x32 path: each lane of |qcoeff| is
// multiplied by the matching lane of |dequant|, the product is halved with
// rounding towards zero, and the eight results are written to |dqcoeff|.
static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
                                                     const int16x8_t dequant,
                                                     tran_low_t *dqcoeff) {
  // Widening 16x16 -> 32 bit multiplies, one per half of the input vectors.
  const int32x4_t product_lo =
      vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  const int32x4_t product_hi =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

  // An arithmetic shift rounds towards negative infinity, whereas the C code
  // uses division, which rounds towards zero. Adding 1 to negative products
  // before shifting makes the two agree.
  const int32x4_t biased_lo =
      vaddq_s32(product_lo, extract_sign_bit(product_lo));
  const int32x4_t biased_hi =
      vaddq_s32(product_hi, extract_sign_bit(product_hi));

#if CONFIG_VP9_HIGHBITDEPTH
  // Halve and store the full 32-bit results.
  vst1q_s32(dqcoeff, vshrq_n_s32(biased_lo, 1));
  vst1q_s32(dqcoeff + 4, vshrq_n_s32(biased_hi, 1));
#else
  {
    // Halve and narrow to 16 bits in a single step before storing.
    const int16x4_t half_lo = vshrn_n_s32(biased_lo, 1);
    const int16x4_t half_hi = vshrn_n_s32(biased_hi, 1);
    vst1q_s16(dqcoeff, vcombine_s16(half_lo, half_hi));
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
188
// Quantizes the eight coefficients read from |coeff_ptr| for the 32x32 path.
// |v_zbin| and |v_round| are expected to be already halved by the caller. The
// quantized values are stored to |qcoeff_ptr|, the dequantized values (halved,
// rounding towards zero) to |dqcoeff_ptr|, and the quantized vector is
// returned so that the caller can track the end-of-block position.
static INLINE int16x8_t quantize_b_32x32_neon_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16x8_t v_zbin, const int16x8_t v_round,
    const int16x8_t v_quant, const int16x8_t v_quant_shift,
    const int16x8_t v_dequant) {
  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
  // All ones in negative lanes, all zeros elsewhere.
  const int16x8_t sign_mask = vshrq_n_s16(coeff, 15);
  const int16x8_t magnitude = vabsq_s16(coeff);

  // Lanes whose magnitude is below zbin are zeroed at the end.
  const int16x8_t keep_mask =
      vreinterpretq_s16_u16(vcgeq_s16(magnitude, v_zbin));

  // Saturating add so that large magnitudes cannot wrap around.
  const int16x8_t rounded = vqaddq_s16(magnitude, v_round);

  // (rounded * quant * 2) >> 16 >> 1 == (rounded * quant) >> 16
  const int16x8_t scaled = vshrq_n_s16(vqdmulhq_s16(rounded, v_quant), 1);
  const int16x8_t summed = vaddq_s16(scaled, rounded);

  // (summed * quant_shift * 2) >> 16 == (summed * quant_shift) >> 15
  const int16x8_t shifted = vqdmulhq_s16(summed, v_quant_shift);

  // Restore the sign: (x ^ mask) - mask negates x wherever mask is -1.
  const int16x8_t with_sign =
      vsubq_s16(veorq_s16(shifted, sign_mask), sign_mask);

  const int16x8_t qcoeff = vandq_s16(with_sign, keep_mask);

  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
  calculate_dqcoeff_and_store_32x32(qcoeff, v_dequant, dqcoeff_ptr);

  return qcoeff;
}

// 32x32 counterpart of the generic quantizer. The differences are that the
// zbin and round values are halved (with rounding) before use, the second
// multiply shifts right by 15 instead of 16, and the dqcoeff values are
// divided by 2 (rounding towards zero rather than to nearest).
//
// Always processes 32 * 32 coefficients; |n_coeffs| and |scan| are unused and
// |skip_block| must be 0. |*eob_ptr| receives the largest (iscan + 1) among
// the coefficients that quantized to a non-zero value, or 0 if there are none.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               int skip_block, const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  const int16x8_t v_one = vdupq_n_s16(1);
  const int16x8_t v_all_ones = vdupq_n_s16(-1);
  uint16x8_t v_eob;
  int offset;
  (void)scan;
  (void)n_coeffs;  // Because we will always calculate 32*32.
  (void)skip_block;
  assert(!skip_block);

  // First eight values, which include the DC component.
  {
    // Only the first element of each vector is DC.
    const int16x8_t zbin_dc = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
    const int16x8_t round_dc = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
    const int16x8_t quant_dc = vld1q_s16(quant_ptr);
    const int16x8_t quant_shift_dc = vld1q_s16(quant_shift_ptr);
    const int16x8_t dequant_dc = vld1q_s16(dequant_ptr);
    // Scan positions are biased by one because the eob does not index from 0.
    const uint16x8_t v_pos =
        vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), v_one));

    const int16x8_t qcoeff = quantize_b_32x32_neon_8(
        coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin_dc, round_dc, quant_dc,
        quant_shift_dc, dequant_dc);

    // vtst yields all ones for non-zero lanes; masking with it keeps the
    // biased scan position of exactly those lanes.
    v_eob = vandq_u16(vtstq_s16(qcoeff, v_all_ones), v_pos);
  }

  // Remaining groups: every lane uses the second (AC) table entry.
  {
    const int16x8_t zbin_ac = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
    const int16x8_t round_ac = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
    const int16x8_t quant_ac = vdupq_n_s16(quant_ptr[1]);
    const int16x8_t quant_shift_ac = vdupq_n_s16(quant_shift_ptr[1]);
    const int16x8_t dequant_ac = vdupq_n_s16(dequant_ptr[1]);

    for (offset = 8; offset < 32 * 32; offset += 8) {
      // Scan positions are biased by one because the eob is not an index.
      const uint16x8_t v_pos =
          vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan + offset), v_one));

      const int16x8_t qcoeff = quantize_b_32x32_neon_8(
          coeff_ptr + offset, qcoeff_ptr + offset, dqcoeff_ptr + offset,
          zbin_ac, round_ac, quant_ac, quant_shift_ac, dequant_ac);

      // Keep the running per-lane maximum of the non-zero scan positions.
      v_eob = vmaxq_u16(v_eob,
                        vandq_u16(vtstq_s16(qcoeff, v_all_ones), v_pos));
    }
  }

  // Horizontal maximum across the eight lanes gives the final eob.
#ifdef __aarch64__
  *eob_ptr = vmaxvq_u16(v_eob);
#else
  {
    const uint16x4_t max_of_halves =
        vmax_u16(vget_low_u16(v_eob), vget_high_u16(v_eob));
    const uint16x4_t max_of_pairs = vpmax_u16(max_of_halves, max_of_halves);
    const uint16x4_t max_of_all = vpmax_u16(max_of_pairs, max_of_pairs);
    vst1_lane_u16(eob_ptr, max_of_all, 0);
  }
#endif  // __aarch64__
}
318