• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include <assert.h>
15 #include <math.h>
16 
17 #include "aom_dsp/arm/mem_neon.h"
18 #include "aom_dsp/arm/sum_neon.h"
19 #include "aom_mem/aom_mem.h"
20 
21 #include "av1/common/quant_common.h"
22 #include "av1/common/seg_common.h"
23 
24 #include "av1/encoder/av1_quantize.h"
25 #include "av1/encoder/encoder.h"
26 #include "av1/encoder/rd.h"
27 
// Horizontal maximum across all 8 lanes of the running eob vector. Each lane
// holds iscan + 1 for a lane that quantized to nonzero (see
// get_max_lane_eob), so the maximum is the end-of-block position.
static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
#ifdef __aarch64__
  // AArch64 provides a single-instruction across-lanes signed max.
  return (uint16_t)vmaxvq_s16(v_eobmax);
#else
  // 32-bit Arm: log2 reduction tree of pairwise maxima.
  // Fold high half onto low half: 8 lanes -> 4 lanes.
  const int16x4_t v_eobmax_3210 =
      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
  // Shift the d-register right by 32 bits so lanes {3,2} line up with {1,0}.
  const int64x1_t v_eobmax_xx32 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
  const int16x4_t v_eobmax_tmp =
      vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
  // Shift right by 16 bits so lane 1 lines up with lane 0.
  const int64x1_t v_eobmax_xxx3 =
      vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
  const int16x4_t v_eobmax_final =
      vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
  // Lane 0 now holds the maximum of all 8 original lanes.
  return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
#endif
}
45 
// Merge one group of eight coefficients into the running per-lane eob
// maximum. Lanes flagged nonzero in v_mask contribute iscan + 1; all other
// lanes contribute 0 and therefore cannot raise the maximum.
static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
                                         int16x8_t v_eobmax,
                                         uint16x8_t v_mask) {
  const int16x8_t v_iscan_plus1 = vaddq_s16(vld1q_s16(iscan), vdupq_n_s16(1));
  // v_mask lanes are all-ones or all-zeros, so a bitwise AND implements the
  // select-or-zero.
  const int16x8_t v_candidate =
      vandq_s16(v_iscan_plus1, vreinterpretq_s16_u16(v_mask));
  return vmaxq_s16(v_eobmax, v_candidate);
}
54 
// Quantize eight coefficients with the fast-path (no zbin / no quant_shift)
// rule and write both the quantized and dequantized values. Returns a
// per-lane mask of which coefficients quantized to a nonzero value.
static INLINE uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
                                       tran_low_t *qcoeff_ptr,
                                       tran_low_t *dqcoeff_ptr,
                                       int16x8_t v_quant, int16x8_t v_dequant,
                                       int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  // Arithmetic shift by 15 yields all-ones for negative lanes, zero
  // otherwise; used below to restore the sign via (x ^ s) - s.
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs = vabsq_s16(v_coeff);
  // Saturating add keeps |coeff| + round from wrapping.
  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
  // vqdmulh computes (a * b * 2) >> 16; the extra >> 1 gives (a * b) >> 16.
  const int16x8_t v_tmp2 = vshrq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
  const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
  store_s16q_to_tran_low(&qcoeff_ptr[0], v_qcoeff);
  store_s16q_to_tran_low(&dqcoeff_ptr[0], v_dqcoeff);
  return v_nz_mask;
}
73 
// Fast-path quantizer: no zero-bin, no quant_shift. The first group of eight
// uses the DC constants in lane 0 and AC constants elsewhere; every later
// group uses AC constants in all lanes. The do/while always runs at least
// once, so count is assumed to be a multiple of 8 and at least 16.
void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  int16x8_t v_eobmax = vdupq_n_s16(-1);

  // First group: DC plus the first seven AC coefficients.
  uint16x8_t v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                       v_quant, v_dequant, v_round, v_zero);
  v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);

  // Broadcast the AC constants (lane 1) into every lane for the rest.
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  intptr_t remaining = count - 8;
  do {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
    remaining -= 8;
  } while (remaining > 0);

  *eob_ptr = get_max_eob(v_eobmax);
}
118 
// Eight-lane low-precision quantize step: identical arithmetic to
// quantize_fp_8 but operating on plain int16_t coefficient buffers.
// Returns a per-lane nonzero mask for eob tracking.
static INLINE uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
                                       int16_t *qcoeff_ptr,
                                       int16_t *dqcoeff_ptr, int16x8_t v_quant,
                                       int16x8_t v_dequant, int16x8_t v_round,
                                       int16x8_t v_zero) {
  const int16x8_t coeff = vld1q_s16(coeff_ptr);
  const int16x8_t sign = vshrq_n_s16(coeff, 15);
  // rounded = saturate(|coeff| + round)
  const int16x8_t rounded = vqaddq_s16(vabsq_s16(coeff), v_round);
  // abs_q = (rounded * quant) >> 16, via the saturating doubling high-half
  // multiply followed by a one-bit correction shift.
  const int16x8_t abs_q = vshrq_n_s16(vqdmulhq_s16(rounded, v_quant), 1);
  // Restore the sign: (x ^ s) - s negates x when s is all-ones.
  const int16x8_t qcoeff = vsubq_s16(veorq_s16(abs_q, sign), sign);
  vst1q_s16(qcoeff_ptr, qcoeff);
  vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, v_dequant));
  return vcgtq_s16(abs_q, v_zero);
}
137 
// Low-precision (int16 buffers) fast-path quantizer. The first group of
// eight carries the DC constants in lane 0; the remaining groups use the AC
// constants in every lane. n_coeffs is assumed to be a multiple of 8 and at
// least 16 (the do/while always executes once).
void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  // Quantization pass: All coefficients with index >= zero_flag are
  // skippable. Note: zero_flag can be zero.
  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  uint16x8_t v_nz_mask;
  intptr_t count = n_coeffs;

  // process dc and the first seven ac coeffs
  v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                            v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  count -= 8;
  // now process the rest of the ac coeffs
  do {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
    count -= 8;
    // Use > 0 (not != 0) to match av1_quantize_fp_neon's guard and avoid an
    // effectively endless loop if n_coeffs were ever not a multiple of 8.
  } while (count > 0);
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
177 
// Quantize eight coefficients on the fast path for scaled transforms
// (log_scale == 1 here; the log_scale == 2 variant below uses a different
// recombination). v_round is expected to be pre-scaled by the caller.
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero, int log_scale) {
  const int16x8_t v_log_scale_minus_1 = vdupq_n_s16(log_scale - 1);
  const int16x8_t v_neg_log_scale_plus_1 = vdupq_n_s16(-(1 + log_scale));
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  // Zero-bin test: |coeff| >= dequant >> (1 + log_scale).
  const uint16x8_t v_mask =
      vcgeq_s16(v_abs_coeff, vshlq_s16(v_dequant, v_neg_log_scale_plus_1));
  // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  // (tmp << (log_scale - 1)) * quant * 2 >> 16 == (tmp * quant) >>
  // (16 - log_scale).
  const int16x8_t v_tmp2 =
      vqdmulhq_s16(vshlq_s16(v_tmp, v_log_scale_minus_1), v_quant);
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // Multiplying by dequant here will use all 16 bits. Cast to unsigned before
  // shifting right. (vshlq_s16 will shift right if shift value is negative)
  const uint16x8_t v_abs_dqcoeff =
      vshlq_u16(vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)),
                vdupq_n_s16(-log_scale));
  // Sign-restore the dequantized magnitude: (x ^ s) - s.
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(vreinterpretq_s16_u16(v_abs_dqcoeff), v_coeff_sign),
                v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}
209 
// Quantize eight coefficients on the fast path for log_scale == 2 (64x64
// transforms). Wide products are assembled from the vqdmulh high half OR'd
// with the relevant bits of the plain low-half multiply.
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
  // Zero-bin test, rearranged to stay in range: 2 * |coeff| >= dequant >> 2
  // (equivalent to |coeff| >= dequant >> 3 for even values).
  const uint16x8_t v_mask =
      vcgeq_u16(vshlq_n_u16(vreinterpretq_u16_s16(v_abs_coeff), 1),
                vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2));
  // abs_coeff = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
  const int16x8_t v_tmp = vandq_s16(vqaddq_s16(v_abs_coeff, v_round),
                                    vreinterpretq_s16_u16(v_mask));
  // tmp32 = (int)((abs_coeff * quant_ptr[rc != 0]) >> (16 - log_scale));
  const int16x8_t v_tmp2 =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp, v_quant), 1),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp, v_quant)), 14)));
  const uint16x8_t v_nz_mask = vcgtq_s16(v_tmp2, v_zero);
  // Sign-restore: (x ^ s) - s negates x when s is all-ones.
  const int16x8_t v_qcoeff =
      vsubq_s16(veorq_s16(v_tmp2, v_coeff_sign), v_coeff_sign);
  // const tran_low_t abs_dqcoeff = (tmp32 * dequant_ptr[rc != 0]) >> log_scale;
  const int16x8_t v_abs_dqcoeff =
      vorrq_s16(vshlq_n_s16(vqdmulhq_s16(v_tmp2, v_dequant), 13),
                vreinterpretq_s16_u16(vshrq_n_u16(
                    vreinterpretq_u16_s16(vmulq_s16(v_tmp2, v_dequant)), 2)));
  const int16x8_t v_dqcoeff =
      vsubq_s16(veorq_s16(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff);
  return v_nz_mask;
}
242 
// Shared driver for the scaled fast-path quantizers (32x32: log_scale == 1,
// 64x64: log_scale == 2). First trims trailing groups of 16 coefficients
// that all fall inside the zero-bin, zeroes their outputs with memset, then
// quantizes the remaining coefficients eight at a time.
static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
    int log_scale) {
  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  const int16x8_t v_round_no_scale = vld1q_s16(round_ptr);
  // Rounding-doubling multiply by 1 << (15 - log_scale) computes
  // round >> log_scale with rounding, per lane.
  int16x8_t v_round =
      vqrdmulhq_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
  intptr_t non_zero_count = n_coeffs;

  // The pre-scan below consumes 16 coefficients per step and the main loop
  // assumes at least one full group beyond the first eight.
  assert(n_coeffs > 16);
  // Pre-scan pass
  // Zero-bin threshold: AC dequant >> (1 + log_scale), broadcast to all
  // lanes (lane 1 of the scaled dequant vector holds the AC value).
  const int16x8_t v_dequant_scaled =
      vshlq_s16(v_dequant, vdupq_n_s16(-(1 + log_scale)));
  const int16x8_t v_zbin_s16 =
      vdupq_lane_s16(vget_low_s16(v_dequant_scaled), 1);
  intptr_t i = n_coeffs;
  // Walk backwards from the end in groups of 16, stopping at the first group
  // containing any coefficient at or above the zero-bin.
  do {
    const int16x8_t v_coeff_a = load_tran_low_to_s16q(coeff_ptr + i - 8);
    const int16x8_t v_coeff_b = load_tran_low_to_s16q(coeff_ptr + i - 16);
    const int16x8_t v_abs_coeff_a = vabsq_s16(v_coeff_a);
    const int16x8_t v_abs_coeff_b = vabsq_s16(v_coeff_b);
    const uint16x8_t v_mask_a = vcgeq_s16(v_abs_coeff_a, v_zbin_s16);
    const uint16x8_t v_mask_b = vcgeq_s16(v_abs_coeff_b, v_zbin_s16);
    // If the coefficient is in the base ZBIN range, then discard.
    if (horizontal_long_add_u16x8(v_mask_a, v_mask_b) == 0) {
      non_zero_count -= 16;
    } else {
      break;
    }
    i -= 16;
  } while (i > 0);

  // Zero the outputs for all coefficients trimmed by the pre-scan.
  const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
  memset(qcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr + non_zero_count, 0,
         remaining_zcoeffs * sizeof(*dqcoeff_ptr));

  // process dc and the first seven ac coeffs
  uint16x8_t v_nz_mask;
  if (log_scale == 2) {
    v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                        v_quant, v_dequant, v_round, v_zero);
  } else {
    v_nz_mask =
        quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                               v_dequant, v_round, v_zero, log_scale);
  }
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  // log_scale is a compile-time constant at every call site, so the branch
  // inside the loop is resolved after inlining.
  for (intptr_t count = non_zero_count - 8; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    if (log_scale == 2) {
      v_nz_mask = quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                          v_quant, v_dequant, v_round, v_zero);
    } else {
      v_nz_mask =
          quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                                 v_dequant, v_round, v_zero, log_scale);
    }
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
319 
// 32x32 fast-path quantizer: delegates to the shared scaled driver with
// log_scale == 1.
void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  // Unused on the fast path (no zero-bin, no quant_shift, no scan order).
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;
  const int log_scale = 1;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, log_scale);
}
335 
// 64x64 fast-path quantizer: delegates to the shared scaled driver with
// log_scale == 2.
void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  // Unused on the fast path (no zero-bin, no quant_shift, no scan order).
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;
  const int log_scale = 2;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, log_scale);
}
351 
// B-path quantizer (with zero-bin and quant_shift), no quant matrix.
// Outputs are zeroed up front; only groups containing at least one
// coefficient at or above the zero-bin are quantized, and within such a
// group vbsl keeps below-zbin lanes at their stored (zero) value.
void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan) {
  (void)quant_shift_ptr;
  (void)scan;

  // Index 0 holds the DC constants, index 1 the AC constants.
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // vceqq(zero, zero) is all-ones, i.e. -1 per lane: "no eob found yet".
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);

  // Lane 0 of the first group is the DC coefficient.
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);

  uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
  // Narrow the mask and read it as one u64: nonzero iff any lane passed.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    // vtmp2 = vtmp + ((vtmp * vquant) >> 16), via shift-right-accumulate.
    int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    // Final scale by quant_shift: (vtmp2 * vquant_shift) >> 16.
    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);

    // Sign-restore and merge with the zeroed output under the zbin mask.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);

    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore lane 0 to the AC constants for the remaining groups.
    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track the largest iscan among lanes that quantized to nonzero.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  // All remaining groups use the AC zero-bin in every lane.
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);
    vcond = vcgeq_s16(v_abs, vzbins);

    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
      int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  // eob is one past the largest nonzero scan position; if nothing survived,
  // the all -1 maximum wraps to 0 in the uint16_t store.
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
448 
// Lane-wise (x0 * x1) >> AOM_QM_BITS for signed 16-bit x0 and unsigned
// quant-matrix weights x1: the high half of the product comes from vqdmulh
// (which pre-doubles, hence the 15 - AOM_QM_BITS left shift) and is OR'd
// with the surviving bits of the plain low-half product.
#define QM_MULL_SHIFT(x0, x1)                                              \
  vreinterpretq_s16_u16(vorrq_u16(                                         \
      vreinterpretq_u16_s16(vshlq_n_s16(                                   \
          vqdmulhq_s16(x0, vreinterpretq_s16_u16(x1)), 15 - AOM_QM_BITS)), \
      vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16(x0), x1), AOM_QM_BITS)))
454 
// B-path quantizer with optional forward (qm_ptr) and inverse (iqm_ptr)
// quant-matrix weighting, for log_scale == 0 block sizes.
//
// Fix: the original code computed `vdequant = QM_MULL_SHIFT(vdequant, viwt)`
// in place, clobbering the base dequant constants. With an inverse matrix
// present, the weights then compounded across successive 8-lane groups (and
// after the first group only lane 0 was restored). The weighted dequant is
// now computed into a temporary each time, matching the scalar reference
// which derives each coefficient's dequant from dequant_ptr directly.
static void aom_quantize_b_helper_16x16_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones (-1 per lane): "no eob found yet".
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  // Lane 0 of the first group is the DC coefficient.
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    // Zero-bin test on the matrix-weighted magnitude.
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      // vtmp2 = vtmp + ((vtmp * vquant) >> 16)
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    // Apply the inverse quant matrix to a copy so the base dequant constants
    // survive for later groups.
    int16x8_t vdeq = vdequant;
    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdeq = QM_MULL_SHIFT(vdequant, viwt);
    }
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdeq);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore lane 0 to the AC constants for the remaining groups.
    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  // All remaining groups use the AC zero-bin in every lane.
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      // Weighted dequant computed into a temporary each iteration; do NOT
      // overwrite vdequant (see the function comment).
      int16x8_t vdeq = vdequant;
      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdeq = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdeq);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
583 
// Quantizes and dequantizes one 32x32-scaled (log_scale == 1) transform
// block, eight coefficients per NEON iteration, optionally applying
// quantization matrices (qm_ptr for the forward step, iqm_ptr for the
// inverse step).  Writes quantized values to qcoeff_ptr, dequantized values
// to dqcoeff_ptr, and the 1-based end-of-block position to *eob_ptr.
static void aom_quantize_b_helper_32x32_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int log_scale = 1;
  // Zero-bin thresholds pre-scaled for this transform size; index 0 is the
  // DC value, index 1 applies to all AC coefficients.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones (-1 in every s16 lane): serves both as the initial eob maximum
  // (so an all-zero block yields eob == 0 after the final +1) and as a
  // per-lane shift count of -1 for vshlq_u16, i.e. a right shift by 1.
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  const int16x8_t v_log_scale = v_eobmax_76543210;

  // Broadcast the AC (index 1) parameters; lane 0 is patched to the DC
  // (index 0) values for the first vector only, which mixes DC with AC.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  // Skip all quantization work when every lane falls below the zero-bin.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      // tmp2 = tmp + ((tmp * quant) >> 16): vqdmulhq gives
      // (2*tmp*quant) >> 16, and vsraq_n accumulates a further >> 1.
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    // Second-stage quantizer multiply (saturating doubling high half).
    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
    // Restore the sign: (x ^ sign) - sign negates x when sign == -1.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    }
    // Dequantize: (tmp2 * dequant) logically shifted right by log_scale
    // (v_log_scale lanes are -1, so vshlq_u16 shifts right by 1).
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore the AC (index 1) parameters for the remaining iterations.
    vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track the highest iscan position that holds a nonzero quantized value.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining vectors are AC-only, so no lane-0 patching is needed.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }
      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);

      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  // eob is one past the last nonzero scan position (0 for an empty block,
  // since the eob maximum starts at -1).
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
721 
// Quantizes and dequantizes one 64x64-scaled (log_scale == 2) transform
// block, eight coefficients per NEON iteration, optionally applying
// quantization matrices.  Compared to the 32x32 path, the wider dynamic
// range requires recovering precision bits: the `ones` term below restores
// the quantizer bit lost to the doubling-high-half multiply, and the
// vorrq merge restores dequant product bits above the low 16-bit multiply.
static void aom_quantize_b_helper_64x64_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  uint16x8_t vwt, viwt;
  const int log_scale = 2;
  // Each 16-bit lane holds 0xFFFE == -2, so vshlq_u16 with this vector
  // performs a logical right shift by 2 (== log_scale).
  const int16x8_t v_log_scale =
      vreinterpretq_s16_s64(vdupq_n_s64(0xFFFEFFFEFFFEFFFE));

  // Zero-bin thresholds pre-scaled for this transform size; index 0 is the
  // DC value, index 1 applies to all AC coefficients.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All-ones (-1 per lane) initial eob maximum; negating gives +1 per lane,
  // used below both as an AND mask and as a left-shift count of 1.
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);

  // Broadcast the AC (index 1) parameters; lane 0 is patched to the DC
  // (index 0) values for the first vector only, which mixes DC with AC.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  // Skip all quantization work when every lane falls below the zero-bin.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      // tmp2 = tmp + ((tmp * quant) >> 16).
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    // Effective (tmp2 * quant_shift) >> 14: vqdmulhq yields >> 15, the
    // left shift by one doubles it, and `ones` restores the low bit that
    // doubling would otherwise leave cleared.
    int16x8_t ones =
        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
    vtmp2 =
        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
    // Restore the sign: (x ^ sign) - sign negates x when sign == -1.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    if (iqm_ptr != NULL) {
      viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant = QM_MULL_SHIFT(vdequant, viwt);
    }
    // Dequantize: (tmp2 * dequant) >> 2, merging the product bits above
    // bit 15 (via vqdmulhq << 13) back into the low 16-bit product >> 2.
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
    v_deq_abs =
        vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore the AC (index 1) parameters for the remaining iterations.
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track the highest iscan position that holds a nonzero quantized value.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining vectors are AC-only, so no lane-0 patching is needed.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      int16x8_t ones =
          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
      vtmp2 =
          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      if (iqm_ptr != NULL) {
        viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant)), v_log_scale));
      v_deq_abs =
          vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, vdequant), 13), v_deq_abs);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  // eob is one past the last nonzero scan position (0 for an empty block,
  // since the eob maximum starts at -1).
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
870 
aom_quantize_b_helper_neon(const tran_low_t * coeff_ptr,intptr_t n_coeffs,const int16_t * zbin_ptr,const int16_t * round_ptr,const int16_t * quant_ptr,const int16_t * quant_shift_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16_t * dequant_ptr,uint16_t * eob_ptr,const int16_t * scan,const int16_t * iscan,const qm_val_t * qm_ptr,const qm_val_t * iqm_ptr,const int log_scale)871 void aom_quantize_b_helper_neon(
872     const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
873     const int16_t *round_ptr, const int16_t *quant_ptr,
874     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
875     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
876     const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
877     const qm_val_t *iqm_ptr, const int log_scale) {
878   switch (log_scale) {  // log_scale for AV1 encoder can be only 0, 1, 2
879     case 0:
880       aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
881                                        quant_ptr, quant_shift_ptr, qcoeff_ptr,
882                                        dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
883                                        iscan, qm_ptr, iqm_ptr);
884       break;
885     case 1:
886       aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
887                                        quant_ptr, quant_shift_ptr, qcoeff_ptr,
888                                        dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
889                                        iscan, qm_ptr, iqm_ptr);
890       break;
891     case 2:
892       aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
893                                        quant_ptr, quant_shift_ptr, qcoeff_ptr,
894                                        dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
895                                        iscan, qm_ptr, iqm_ptr);
896       break;
897   }
898 }
899 
// Public entry point: quantize a 32x32 transform block without quantization
// matrices.  Thin forwarder to the generic helper with log_scale == 1.
void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 1;
  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             /*qm_ptr=*/NULL, /*iqm_ptr=*/NULL, log_scale);
}
913 
// Public entry point: quantize a 64x64 transform block without quantization
// matrices.  Thin forwarder to the generic helper with log_scale == 2.
void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 2;
  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             /*qm_ptr=*/NULL, /*iqm_ptr=*/NULL, log_scale);
}
927