/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include <assert.h>
#include <math.h>

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_mem/aom_mem.h"

#include "av1/common/quant_common.h"
#include "av1/common/seg_common.h"

#include "av1/encoder/av1_quantize.h"
#include "av1/encoder/encoder.h"
#include "av1/encoder/rd.h"

// Reduces the per-lane eob maxima in v_eobmax to a single scalar: the
// end-of-block index for the whole transform block.
static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
#ifdef __aarch64__
  return (uint16_t)vmaxvq_s16(v_eobmax);
#else
  // AArch32 has no across-vector max, so fold the vector onto itself:
  // 8 lanes -> 4 -> 2 -> 1, taking the pairwise max at each step.
  const int16x4_t v_eobmax_3210 =
      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
  // Shift the 64-bit view right by 32 to line lanes {3,2} up with {1,0}.
  const int64x1_t v_eobmax_xx32 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
  const int16x4_t v_eobmax_tmp =
      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
  // Shift right by 16 to line lane 1 up with lane 0 for the final max.
  const int64x1_t v_eobmax_xxx3 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
  const int16x4_t v_eobmax_final =
      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
#endif
}
45
// Folds one group of 8 coefficients into the running per-lane eob maximum.
// Lanes flagged non-zero in v_mask contribute (iscan + 1); zero lanes
// contribute 0, so they never raise the maximum.
static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
                                         int16x8_t v_eobmax,
                                         uint16x8_t v_mask) {
  const int16x8_t v_scan = vld1q_s16(iscan);
  const int16x8_t v_scan_plus1 = vaddq_s16(v_scan, vdupq_n_s16(1));
  const int16x8_t v_candidate =
      vbslq_s16(v_mask, v_scan_plus1, vdupq_n_s16(0));
  return vmaxq_s16(v_candidate, v_eobmax);
}
54
// Quantizes 8 coefficients with the "fp" (no zbin / quant_shift) rule:
//   qcoeff = sign(coeff) * (((|coeff| + round) * quant) >> 16)
//   dqcoeff = qcoeff * dequant
// Writes qcoeff/dqcoeff and returns a per-lane mask of non-zero qcoeffs.
static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
                                       tran_low_t *qcoeff_ptr,
                                       tran_low_t *dqcoeff_ptr,
                                       int16x8_t v_quant, int16x8_t v_dequant,
                                       int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  // Arithmetic shift of the sign bit: 0 for non-negative lanes, -1 otherwise.
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs = vabsq_s16(v_coeff);
  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
  // vqdmulh computes (2*a*b) >> 16; the extra >> 1 yields (a*b) >> 16.
  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  // Reapply the sign: (x ^ s) - s negates x exactly where s == -1.
  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
  store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
  store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
  return v_nz_mask;
}
73
// Fast-path ("fp") quantization over `count` coefficients: no zbin or
// quant_shift stage. Writes qcoeff/dqcoeff and sets *eob_ptr to
// 1 + highest iscan index of a non-zero quantized coefficient (0 if none).
void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  // Quantization pass: All coefficients with index >= zero_flag are
  // skippable. Note: zero_flag can be zero.
  const int16x8_t v_zero = vdupq_n_s16(0);
  // Lane 0 of the loaded constants is the DC value, lane 1 the AC value.
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  // Per-lane running maximum of (iscan + 1); -1 means "nothing found yet".
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  uint16x8_t v_nz_mask;
  // process dc and the first seven ac coeffs
  v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                            v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(&iscan[0], v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  count -= 8;
  // now process the rest of the ac coeffs
  // NOTE(review): the do/while assumes count is a multiple of 8 and at
  // least 16 on entry (the smallest transform has 16 coefficients); with
  // count == 8 the body would touch one group out of bounds -- confirm.
  do {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
    count -= 8;
  } while (count > 0);
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
118
// Low-precision variant of quantize_fp_8: identical math, but operates on
// plain int16_t coefficient buffers instead of tran_low_t.
//   qcoeff = sign(coeff) * (((|coeff| + round) * quant) >> 16)
//   dqcoeff = qcoeff * dequant
// Returns a per-lane mask of non-zero quantized coefficients.
static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
                                       int16_t *qcoeff_ptr,
                                       int16_t *dqcoeff_ptr, int16x8_t v_quant,
                                       int16x8_t v_dequant, int16x8_t v_round,
                                       int16x8_t v_zero) {
  const int16x8_t coeff = vld1q_s16(coeff_ptr);
  const int16x8_t sign = vshrq_n_s16(coeff, 15);
  const int16x8_t abs_coeff = vabsq_s16(coeff);
  const int16x8_t rounded = vqaddq_s16(abs_coeff, v_round);
  // vqdmulh gives (2*a*b) >> 16; shifting right once yields (a*b) >> 16.
  const int16x8_t abs_qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, v_quant), 1);
  // (x ^ s) - s restores the sign of the original coefficient.
  const int16x8_t qcoeff = vsubq_s16(veorq_s16(abs_qcoeff, sign), sign);
  const int16x8_t dqcoeff = vmulq_s16(qcoeff, v_dequant);
  vst1q_s16(qcoeff_ptr, qcoeff);
  vst1q_s16(dqcoeff_ptr, dqcoeff);
  return vcgtq_s16(abs_qcoeff, v_zero);
}
137
// Low-precision (int16 buffers) fast-path quantization over n_coeffs
// coefficients. Writes qcoeff/dqcoeff and sets *eob_ptr to 1 + highest
// iscan index of a non-zero quantized coefficient (0 if none).
//
// n_coeffs must be a positive multiple of 8.
void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  (void)scan;

  assert(n_coeffs >= 8);
  assert(n_coeffs % 8 == 0);

  // Quantization pass: All coefficients with index >= zero_flag are
  // skippable. Note: zero_flag can be zero.
  const int16x8_t v_zero = vdupq_n_s16(0);
  // Lane 0 of the loaded constants is the DC value, lane 1 the AC value.
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  // Per-lane running maximum of (iscan + 1); -1 means "nothing found yet".
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  uint16x8_t v_nz_mask;
  intptr_t count = n_coeffs;

  // process dc and the first seven ac coeffs
  v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                            v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  count -= 8;
  // now process the rest of the ac coeffs
  // Pre-tested loop (rather than do/while) so nothing is processed out of
  // bounds when n_coeffs == 8, and `> 0` (rather than `!= 0`, which could
  // loop forever on a non-multiple-of-8 count) to match
  // av1_quantize_fp_neon.
  while (count > 0) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
    count -= 8;
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
177
// Quantizes 8 coefficients for large transforms (log_scale 1 or 2), no
// quant matrix. Coefficients below dequant >> (1 + log_scale) fall inside
// the implicit zbin and are forced to zero before quantization. Returns a
// per-lane mask of non-zero quantized coefficients.
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero, int log_scale) {
  const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1);
  const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale));
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  // vshlq with a negative count shifts right: dequant >> (1 + log_scale).
  const uint16x8_t v_mask =
      vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1));
  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  // (tmp << (log_scale - 1)) through vqdmulh == (tmp*quant) >> (16-log_scale).
  const int16x8_t v_tmp2 =
      vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  // (x ^ s) - s restores the coefficient's sign.
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // Multiplying by dequant here will use all 16 bits. Cast to unsigned before
  // shifting right. (vshlq_s16 will shift right if shift value is negative)
  const uint16x8_t v_abs_dqcoeff =
      vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)),
                vdupq_n_s16(-log_scale));
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign),
                v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}
209
// Specialization of quantize_fp_logscale_8 for log_scale == 2 (64x64
// transforms). The wider products needed at this scale are emulated by
// combining vqdmulh high bits with the low 16-bit product. Returns a
// per-lane mask of non-zero quantized coefficients.
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  // zbin test: (|coeff| << 1) >= (dequant >> 2), done unsigned to keep the
  // shifted values in range.
  const uint16x8_t v_mask =
      vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1),
                vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2));
  // abs_coeff = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
  const int16x8_t v_tmp2 =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14)));
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  // (x ^ s) - s restores the coefficient's sign.
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
  const int16x8_t v_abs_dqcoeff =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2)));
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}
242
// Shared implementation of the 32x32 / 64x64 fp quantizers (log_scale 1/2,
// no quant matrix). A back-to-front pre-scan finds the tail of coefficients
// that all fall inside the base (AC) zbin, zero-fills that tail, then
// quantizes the remaining head 8 coefficients at a time.
static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
    int log_scale) {
  const int16x8_t v_zero = vdupq_n_s16(0);
  // Lane 0 holds the DC constant, lane 1 the AC constant.
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  const int16x8_t v_round_no_scale = vld1q_s16(round_ptr);
  // Rounded right shift of round by log_scale via a rounding doubling
  // multiply: vqrdmulh(x, 1 << (15 - log_scale)) == round(x >> log_scale).
  int16x8_t v_round =
      vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  intptr_t non_zero_count = n_coeffs;

  assert(n_coeffs > 16);
  // Pre-scan pass
  const int16x8_t v_dequant_scaled =
      vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale)));
  // Lane 1 is the AC threshold; the scanned tail contains only AC coeffs.
  const int16x8_t v_zbin_s16 =
      vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1);
  // Walk backwards 16 coefficients at a time until a group clears the zbin.
  intptr_t i = n_coeffs;
  do {
    const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8);
    const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16);
    const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a);
    const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b);
    const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
    const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
    // If the coefficient is in the base ZBIN range, then discard.
    if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
      non_zero_count -= 16;
    } else {
      break;
    }
    i -= 16;
  } while (i > 0);

  // Zero-fill the skipped tail so the main loop can stop early.
  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
  memset(qcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*dqcoeff_ptr));

  // process dc and the first seven ac coeffs
  uint16x8_t v_nz_mask;
  if (log_scale == 2) {
    v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                        v_quant, v_dequant, v_round, v_zero);
  } else {
    v_nz_mask =
        quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                               v_dequant, v_round, v_zero, log_scale);
  }
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    if (log_scale == 2) {
      v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                          v_quant, v_dequant, v_round, v_zero);
    } else {
      v_nz_mask =
          quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                                 v_dequant, v_round, v_zero, log_scale);
    }
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
319
// 32x32 fp quantizer: delegates to the shared no-qmatrix implementation
// with log_scale == 1.
void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  // The fp path uses neither zbin, quant_shift, nor the forward scan.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;
  const int log_scale = 1;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, log_scale);
}
335
// 64x64 fp quantizer: delegates to the shared no-qmatrix implementation
// with log_scale == 2.
void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  // The fp path uses neither zbin, quant_shift, nor the forward scan.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;
  const int log_scale = 2;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, log_scale);
}
351
// Full "b" quantizer (zbin + round + quant + quant_shift) with no quant
// matrices. Groups of 8 coefficients whose magnitudes all fall below the
// zbin are skipped entirely. Writes qcoeff/dqcoeff and sets *eob_ptr to
// 1 + index of the last non-zero quantized coefficient.
void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan) {
  (void)quant_shift_ptr;
  (void)scan;

  // Index 0 = DC constants, index 1 = AC constants.
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  // Outputs default to zero; only lanes passing the zbin test get written.
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones == -1 per lane: "no non-zero coefficient found yet".
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);

  // Lane 0 of the first group is the DC coefficient.
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);

  uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
  // Narrow the mask to 8 bytes; non-zero means at least one lane passed.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    // tmp2 = tmp + ((tmp * quant) >> 16), then >> via quant_shift.
    int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);

    // (x ^ s) - s restores the coefficient's sign.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);

    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore lane 0 to the AC constants for the remaining groups.
    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track the max iscan among lanes that produced a non-zero qcoeff.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining groups are all-AC.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);
    vcond = vcgeq_s16(v_abs, vzbins);

    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
      int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
448
// Computes (x0 * x1) >> AOM_QM_BITS for signed x0 and unsigned qm weights
// x1. vqdmulh supplies the high bits of the product; the bits it drops are
// recovered from the low 16 bits of a plain multiply and OR'd back in.
#define QM_MULL_SHIFT(x0, x1) \
  vreinterpretq_s16_u16(vorrq_u16( \
      vreinterpretq_u16_s16(vshlq_n_s16( \
          vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
      vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
454
// Full "b" quantizer for the 16x16 class (no log_scale shift) with optional
// quant matrix (qm_ptr, scales coefficients before the zbin test and
// quantization) and inverse quant matrix (iqm_ptr, scales the dequant
// step). Either matrix pointer may be NULL. Writes qcoeff/dqcoeff and sets
// *eob_ptr to 1 + index of the last non-zero quantized coefficient.
static void aom_quantize_b_helper_16x16_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  // Index 0 = DC constants, index 1 = AC constants.
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  // Outputs default to zero; only lanes passing the zbin test get written.
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones == -1 per lane: "no non-zero coefficient found yet".
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  // Lane 0 of the first group is the DC coefficient.
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      // ((tmp * wt * quant) >> AOM_QM_BITS-scaled) + tmp, as in the scalar
      // reference.
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
    // (x ^ s) - s restores the coefficient's sign.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    // Scale the dequant step into a temporary so vdequant keeps the
    // unscaled dequant_ptr values; overwriting vdequant here would carry
    // (and compound) this group's iqm scaling into later groups.
    int16x8_t vdequant_dqc = vdequant;
    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant_dqc = QM_MULL_SHIFT(vdequant, viwt);
    }
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant_dqc);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore lane 0 to the AC constants for the remaining groups.
    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track the max iscan among lanes that produced a non-zero qcoeff.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining groups are all-AC.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      // As above: keep vdequant pristine; scale into a per-group temporary.
      int16x8_t vdequant_dqc = vdequant;
      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant_dqc = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant_dqc);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
583
// Full "b" quantizer for the 32x32 class (log_scale == 1) with optional
// quant matrix (qm_ptr) and inverse quant matrix (iqm_ptr); either may be
// NULL. zbin and round are pre-rounded down by log_scale, and the dequant
// product is shifted right by log_scale. Writes qcoeff/dqcoeff and sets
// *eob_ptr to 1 + index of the last non-zero quantized coefficient.
static void aom_quantize_b_helper_32x32_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int log_scale = 1;
  // Index 0 = DC constants, index 1 = AC constants.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  // Outputs default to zero; only lanes passing the zbin test get written.
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones == -1 per lane: "no non-zero coefficient found yet".
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  // All lanes -1 == -log_scale: used as a right-shift count with vshlq_u16.
  const int16x8_t v_log_scale = v_eobmax_76543210;

  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  // Lane 0 of the first group is the DC coefficient.
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    // log_scale == 1: no final >> 1 after the quant_shift multiply.
    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
    // (x ^ s) - s restores the coefficient's sign.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    // Scale the dequant step into a temporary so vdequant keeps the
    // unscaled dequant_ptr values; overwriting vdequant here would carry
    // (and compound) this group's iqm scaling into later groups.
    int16x8_t vdequant_dqc = vdequant;
    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant_dqc = QM_MULL_SHIFT(vdequant, viwt);
    }
    // Unsigned shift by v_log_scale (all -1) == right shift by log_scale.
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_dqc)), v_log_scale));
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore lane 0 to the AC constants for the remaining groups.
    vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track the max iscan among lanes that produced a non-zero qcoeff.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining groups are all-AC.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }
      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);

      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      // As above: keep vdequant pristine; scale into a per-group temporary.
      int16x8_t vdequant_dqc = vdequant;
      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant_dqc = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_dqc)), v_log_scale));
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
721
/*
 * Quantize helper for 64x64 transforms (log_scale == 2), processing 8
 * coefficients per NEON iteration. The first 8 coefficients are handled
 * separately because lane 0 holds the DC coefficient, which uses the
 * index-0 entries of zbin/round/quant/quant_shift/dequant; every other
 * coefficient uses the index-1 (AC) entries.
 *
 * Outputs: qcoeff_ptr and dqcoeff_ptr are fully overwritten, and *eob_ptr
 * receives 1 + the highest iscan position whose quantized value is nonzero
 * (0 when all coefficients quantize to zero). qm_ptr / iqm_ptr, when
 * non-NULL, supply per-coefficient quantization-matrix weights, applied via
 * QM_MULL_SHIFT (macro defined earlier in this file — semantics assumed to
 * be a weight-multiply plus AOM_QM_BITS shift; confirm against its
 * definition).
 */
static void aom_quantize_b_helper_64x64_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int log_scale = 2;
  // Every 16-bit lane holds -2 (0xFFFE); vshlq_u16 with a negative per-lane
  // count shifts right, so this acts as a right shift by log_scale.
  const int16x8_t v_log_scale =
      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));

  // zbins[0] is the DC threshold, zbins[1] the AC threshold, both scaled
  // down for the 64x64 transform size.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones (-1) per lane: "no nonzero coefficient seen yet".
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  // vneg(-1) == +1 per lane; used below both as a shift count and as a mask.
  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);

  // Broadcast the AC (index 1) parameters; lane 0 is patched to the DC
  // values inside the first-vector path only.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  // ---- First 8 coefficients (lane 0 = DC). ----
  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    // Apply the forward quant-matrix weight before the zbin comparison.
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  // Narrow the lane masks to one byte each; zero means all 8 coefficients
  // are below the zbin and the whole vector can be skipped.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    // Patch lane 0 with the DC parameters.
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
    int16x8_t vtmp = vqaddq_s16(v_abs, vround);  // saturating abs + round

    // tmp2 = tmp + ((tmp * quant) >> 16): vqdmulhq computes
    // (a * b * 2) >> 16, and vsraq_n_s16(..., 1) accumulates it >> 1.
    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    // Compute (vtmp2 * quant_shift) >> 14: vqdmulhq yields the product
    // >> 15; doubling it (vshlq by v_ones == 1) and OR-ing back bit 14 of
    // the low 16-bit product (`ones`) reconstructs the >> 14 result that a
    // plain high-half multiply would lose.
    int16x8_t ones =
        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
    vtmp2 =
        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
    // Restore the sign: (x ^ sign) - sign negates lanes whose sign is -1.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    // Keep the new value only in lanes where the zbin test passed.
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    if (iqm_ptr != NULL) {
      // Fold the inverse quant-matrix weight into the dequant step.
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    }
    // Dequantized magnitude: (vtmp2 * dequant) >> log_scale. The low
    // 16-bit product is shifted right by 2 (v_log_scale); the bits lost
    // above bit 15 are merged back in from the vqdmulhq high half
    // (product >> 15, shifted left by 13 == product >> 2).
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    v_deq_abs =
        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore the AC parameters in lane 0 for the remaining vectors.
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track the highest iscan index with a nonzero quantized magnitude.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);  // lane 0 back to AC zbin

  // ---- Remaining coefficients, 8 per iteration, all AC parameters. ----
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      // Same (vtmp2 * quant_shift) >> 14 reconstruction as the DC path.
      int16x8_t ones =
          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
      vtmp2 =
          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        // NOTE(review): this overwrites vdequant in place and it is never
        // reloaded from dequant_ptr before the next loop iteration (unlike
        // the first-vector path above, which restores lane values), so the
        // iqm weights compound across iterations. Verify against the C
        // reference (aom_quantize_b_helper_c) whether that is intended.
        vdequant = QM_MULL_SHIFT(vdequant, viwt);
      }
      // (vtmp2 * dequant) >> log_scale, with high-half merge as above.
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
      v_deq_abs =
          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  // eob is 1-based: the -1 sentinel (no nonzero coefficients) maps to 0.
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
870
/*
 * Dispatch to the transform-size-specific NEON quantize helper.
 *
 * log_scale selects the variant: 0 -> 16x16 path, 1 -> 32x32 path,
 * 2 -> 64x64 path. These are the only values the AV1 encoder produces.
 * All other parameters are forwarded unchanged; qm_ptr / iqm_ptr may be
 * NULL to disable quantization-matrix weighting.
 */
void aom_quantize_b_helper_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr, const int log_scale) {
  switch (log_scale) {  // log_scale for AV1 encoder can be only 0, 1, 2
    case 0:
      aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    case 1:
      aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    case 2:
      aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    default:
      // Previously an invalid log_scale fell through silently, leaving the
      // output buffers and *eob_ptr untouched. Trap it in debug builds;
      // release behavior is unchanged.
      assert(0 && "log_scale must be 0, 1 or 2");
      break;
  }
}
899
/*
 * Public quantize_b entry point for 32x32 transforms. Thin shim over the
 * generic helper: no quantization matrices, log_scale fixed at 1.
 */
void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 1;  // 32x32 transform size
  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             /*qm_ptr=*/NULL, /*iqm_ptr=*/NULL, log_scale);
}
913
/*
 * Public quantize_b entry point for 64x64 transforms. Thin shim over the
 * generic helper: no quantization matrices, log_scale fixed at 2.
 */
void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 2;  // 64x64 transform size
  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             /*qm_ptr=*/NULL, /*iqm_ptr=*/NULL, log_scale);
}
927