/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"

#include "aom_dsp/quantize.h"

static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
#if AOM_ARCH_AARCH64
  return vaddvq_u32(a);
#else
  const uint64x2_t b = vpaddlq_u32(a);
  const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
  return (uint32_t)vget_lane_u64(c, 0);
#endif
}

static INLINE uint16x4_t
quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
           tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
           int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32,
           int32x4_t v_quant_shift_s32, int log_scale) {
  const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
  const int32x4_t v_coeff_sign =
      vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
  const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
  // if (abs_coeff < zbins[rc != 0]),
  const uint32x4_t v_zbin_mask = vcgeq_s32(v_abs_coeff, v_zbin_s32);
  const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
  // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
  const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
  // const int32_t tmpw32 = tmp * wt;
  const int32x4_t v_tmpw32 = vmulq_s32(v_tmp, vdupq_n_s32((1 << AOM_QM_BITS)));
  // const int32_t tmp2 = (int32_t)((tmpw32 * quant64) >> 16);
  const int32x4_t v_tmp2 = vqdmulhq_s32(v_tmpw32, v_quant_s32);
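  // vqdmulhq_s32 returns the high half of the saturating doubling product,
  // i.e. (2 * a * b) >> 32. The quant and quant_shift vectors were pre-shifted
  // left by 15 by the caller, so this evaluates to (tmpw32 * quant) >> 16.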
  // const int32_t tmp3 =
  //     ((((tmp2 + tmpw32) << log_scale) * (int64_t)(quant_shift << 15)) >> 32);
  const int32x4_t v_tmp3 = vqdmulhq_s32(
      vshlq_s32(vaddq_s32(v_tmp2, v_tmpw32), v_log_scale), v_quant_shift_s32);
  // const int abs_qcoeff = vmask ? (int)tmp3 >> AOM_QM_BITS : 0;
  const int32x4_t v_abs_qcoeff = vandq_s32(vreinterpretq_s32_u32(v_zbin_mask),
                                           vshrq_n_s32(v_tmp3, AOM_QM_BITS));
  // const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant_iwt) >> log_scale;
  // vshlq_s32 will shift right if shift value is negative.
  const int32x4_t v_abs_dqcoeff =
      vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
  // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
  const int32x4_t v_qcoeff =
      vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
  // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
  const int32x4_t v_dqcoeff =
      vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);

  vst1q_s32(qcoeff_ptr, v_qcoeff);
  vst1q_s32(dqcoeff_ptr, v_dqcoeff);

  // Used to find eob.
  const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
  return vmovn_u32(nz_qcoeff_mask);
}

static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
                                         int16x8_t v_eobmax,
                                         uint16x8_t v_mask) {
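  // The iscan values are offset by +1 below so that lanes whose quantized
  // coefficient is zero (selected to 0) can never win the max; the running
  // maximum therefore holds "last nonzero scan position + 1".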
  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
  const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
  return vmaxq_s16(v_eobmax, v_nz_iscan);
}

#if !CONFIG_REALTIME_ONLY
static INLINE void get_min_max_lane_eob(const int16_t *iscan,
                                        int16x8_t *v_eobmin,
                                        int16x8_t *v_eobmax, uint16x8_t v_mask,
                                        intptr_t n_coeffs) {
  const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
  const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
#if SKIP_EOB_FACTOR_ADJUST
  const int16x8_t v_nz_iscan_min =
      vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs));
  *v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min);
#else
  (void)v_eobmin;
#endif
  *v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max);
}
#endif  // !CONFIG_REALTIME_ONLY

static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
#if AOM_ARCH_AARCH64
  return (uint16_t)vmaxvq_s16(v_eobmax);
#else
  const int16x4_t v_eobmax_3210 =
      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
  const int64x1_t v_eobmax_xx32 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
  const int16x4_t v_eobmax_tmp =
      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
  const int64x1_t v_eobmax_xxx3 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
  const int16x4_t v_eobmax_final =
      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
#endif
}

#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
#if AOM_ARCH_AARCH64
  return (uint16_t)vminvq_s16(v_eobmin);
#else
  const int16x4_t v_eobmin_3210 =
      vmin_s16(vget_low_s16(v_eobmin), vget_high_s16(v_eobmin));
  const int64x1_t v_eobmin_xx32 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmin_3210), 32);
  const int16x4_t v_eobmin_tmp =
      vmin_s16(v_eobmin_3210, vreinterpret_s16_s64(v_eobmin_xx32));
  const int64x1_t v_eobmin_xxx3 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmin_tmp), 16);
  const int16x4_t v_eobmin_final =
      vmin_s16(v_eobmin_tmp, vreinterpret_s16_s64(v_eobmin_xxx3));
  return (uint16_t)vget_lane_s16(v_eobmin_final, 0);
#endif
}
#endif  // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY

static void highbd_quantize_b_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const int log_scale) {
  (void)scan;
  const int16x4_t v_quant = vld1_s16(quant_ptr);
  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
  const int16x4_t v_zero = vdup_n_s16(0);
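  // For log_scale > 0 the round and zbin values must be scaled down by
  // ROUND_POWER_OF_TWO(x, log_scale); vqrdmulh_n_s16 with the constant
  // (1 << (15 - log_scale)) performs exactly that rounding right shift.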
  const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
  const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
  const int16x4_t v_round_log_scale =
      vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
  const int16x4_t v_round =
      vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
  const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
  const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
  const int16x4_t v_zbin_log_scale =
      vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
  const int16x4_t v_zbin =
      vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
  int32x4_t v_round_s32 = vmovl_s16(v_round);
  int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
  int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
  int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
  int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
  uint16x4_t v_mask_lo, v_mask_hi;
  int16x8_t v_eobmax = vdupq_n_s16(-1);

  intptr_t non_zero_count = n_coeffs;

  assert(n_coeffs > 8);
  // Pre-scan pass
  const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
  intptr_t i = n_coeffs;
  do {
    const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
    const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
    const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
    const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
    const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_s32x);
    const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_s32x);
    // If the coefficient is in the base ZBIN range, then discard.
    if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
      non_zero_count -= 8;
    } else {
      break;
    }
    i -= 8;
  } while (i > 0);

  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
  memset(qcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*dqcoeff_ptr));

  // DC and first 3 AC
  v_mask_lo =
      quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
                 v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);

  // overwrite the DC constants with AC constants
  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
  v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
  v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);

  // 4 more AC
  v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
                         v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
                         v_quant_shift_s32, log_scale);

  v_eobmax =
      get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));

  intptr_t count = non_zero_count - 8;
  for (; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
                           v_dequant_s32, v_round_s32, v_zbin_s32,
                           v_quant_shift_s32, log_scale);
    v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
                           v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
                           v_quant_shift_s32, log_scale);
    // Find the max lane eob for 8 coeffs.
    v_eobmax =
        get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
  }

  *eob_ptr = get_max_eob(v_eobmax);
}

void aom_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                         eob_ptr, scan, iscan, 0);
}

void aom_highbd_quantize_b_32x32_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                         eob_ptr, scan, iscan, 1);
}

void aom_highbd_quantize_b_64x64_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
                         quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
                         eob_ptr, scan, iscan, 2);
}

#if !CONFIG_REALTIME_ONLY
static void highbd_quantize_b_adaptive_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const int log_scale) {
  (void)scan;
  const int16x4_t v_quant = vld1_s16(quant_ptr);
  const int16x4_t v_dequant = vld1_s16(dequant_ptr);
  const int16x4_t v_zero = vdup_n_s16(0);
  const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
  const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
  const int16x4_t v_round_log_scale =
      vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
  const int16x4_t v_round =
      vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
  const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
  const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
  const int16x4_t v_zbin_log_scale =
      vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
  const int16x4_t v_zbin =
      vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
  int32x4_t v_round_s32 = vmovl_s16(v_round);
  int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
  int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
  int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
  int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
  uint16x4_t v_mask_lo, v_mask_hi;
  int16x8_t v_eobmax = vdupq_n_s16(-1);
  int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);

  assert(n_coeffs > 8);
  // Pre-scan pass
  const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
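  // The adaptive pre-scan widens the AC zbin by a dequant-scaled EOB_FACTOR
  // margin; trailing coefficient groups below this wider threshold are zeroed
  // outright.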
  const int prescan_add_1 =
      ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7 + AOM_QM_BITS);
  const int32x4_t v_zbin_prescan =
      vaddq_s32(v_zbin_s32x, vdupq_n_s32(prescan_add_1));
  intptr_t non_zero_count = n_coeffs;
  intptr_t i = n_coeffs;
  do {
    const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
    const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
    const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
    const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
    const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_prescan);
    const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_prescan);
    // If the coefficient is within the widened ZBIN range, then discard.
    if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
      non_zero_count -= 8;
    } else {
      break;
    }
    i -= 8;
  } while (i > 0);

  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
  memset(qcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*dqcoeff_ptr));

  // DC and first 3 AC
  v_mask_lo =
      quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
                 v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);

  // overwrite the DC constants with AC constants
  v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
  v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
  v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
  v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
  v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);

  // 4 more AC
  v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
                         v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
                         v_quant_shift_s32, log_scale);

  get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax,
                       vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);

  intptr_t count = non_zero_count - 8;
  for (; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
                           v_dequant_s32, v_round_s32, v_zbin_s32,
                           v_quant_shift_s32, log_scale);
    v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
                           v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
                           v_quant_shift_s32, log_scale);

    get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax,
                         vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);
  }

  int eob = get_max_eob(v_eobmax);

#if SKIP_EOB_FACTOR_ADJUST
  const int first = get_min_eob(v_eobmin);
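  // When the quantized block has collapsed to a single +/-1 that still lies
  // inside the widened zbin window, drop it as well and signal an empty block.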
  if (eob >= 0 && first == eob) {
    const int rc = scan[eob];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                             ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
      const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
      const qm_val_t wt = (1 << AOM_QM_BITS);
      const int coeff = coeff_ptr[rc] * wt;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
          coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        eob = -1;
      }
    }
  }
#endif  // SKIP_EOB_FACTOR_ADJUST
  *eob_ptr = eob + 1;
}

void aom_highbd_quantize_b_adaptive_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  highbd_quantize_b_adaptive_neon(
      coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
      qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
}

void aom_highbd_quantize_b_32x32_adaptive_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  highbd_quantize_b_adaptive_neon(
      coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
      qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
}

void aom_highbd_quantize_b_64x64_adaptive_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  highbd_quantize_b_adaptive_neon(
      coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
      qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 2);
}
#endif  // !CONFIG_REALTIME_ONLY