1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <assert.h>
13 #include <math.h>
14
15 #include "./vpx_config.h"
16 #include "vpx_mem/vpx_mem.h"
17
18 #include "vp9/common/vp9_quant_common.h"
19 #include "vp9/common/vp9_seg_common.h"
20
21 #include "vp9/encoder/vp9_encoder.h"
22 #include "vp9/encoder/vp9_quantize.h"
23 #include "vp9/encoder/vp9_rd.h"
24
25 #include "vpx_dsp/arm/idct_neon.h"
26 #include "vpx_dsp/arm/mem_neon.h"
27 #include "vpx_dsp/vpx_dsp_common.h"
28
// Dequantizes eight coefficients and writes them to dqcoeff[0..7].
//
// Each 16-bit lane of qcoeff is multiplied by the matching lane of dequant
// using a widening multiply, so the products are formed in 32 bits.
//   - CONFIG_VP9_HIGHBITDEPTH: the 32-bit products are stored as-is.
//   - otherwise: each product is narrowed to its low 16 bits before the
//     store.
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                               const int16x8_t dequant,
                                               tran_low_t *dqcoeff) {
  const int16x4_t q_lo = vget_low_s16(qcoeff);
  const int16x4_t q_hi = vget_high_s16(qcoeff);
  const int16x4_t dq_lo = vget_low_s16(dequant);
  const int16x4_t dq_hi = vget_high_s16(dequant);
  // Widening multiplies: lanes 0..3 and lanes 4..7.
  const int32x4_t prod_lo = vmull_s16(q_lo, dq_lo);
  const int32x4_t prod_hi = vmull_s16(q_hi, dq_hi);

#if CONFIG_VP9_HIGHBITDEPTH
  vst1q_s32(dqcoeff, prod_lo);
  vst1q_s32(dqcoeff + 4, prod_hi);
#else
  {
    const int16x8_t narrowed =
        vcombine_s16(vmovn_s32(prod_lo), vmovn_s32(prod_hi));
    vst1q_s16(dqcoeff, narrowed);
  }
#endif  // CONFIG_VP9_HIGHBITDEPTH
}
44
// Quantizes and dequantizes one group of eight coefficients on behalf of
// vp9_quantize_fp_neon().
//
// Per lane:
//   tmp     = saturating_add(|coeff|, round)
//   q       = (tmp * quant) >> 16
//   qcoeff  = q with the sign of coeff restored
//   dqcoeff = qcoeff * dequant  (see calculate_dqcoeff_and_store())
//
// coeff_ptr / iscan_ptr point at the eight inputs of this group;
// qcoeff_ptr / dqcoeff_ptr point at the eight outputs of this group.
// v_round / v_quant / v_dequant hold the per-lane parameters.
//
// Returns v_eobmax updated with a lane-wise max against (iscan + 1) for
// every lane whose quantized magnitude is non-zero (0 for the other lanes).
static INLINE int16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
                                      const int16_t *iscan_ptr,
                                      const int16x8_t v_round,
                                      const int16x8_t v_quant,
                                      const int16x8_t v_dequant,
                                      tran_low_t *qcoeff_ptr,
                                      tran_low_t *dqcoeff_ptr,
                                      const int16x8_t v_eobmax) {
  const int16x8_t v_zero = vdupq_n_s16(0);
  const int16x8_t v_one = vdupq_n_s16(1);
  const int16x8_t v_iscan = vld1q_s16(iscan_ptr);
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  // All ones for negative lanes, all zeros otherwise.
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_abs = vabsq_s16(v_coeff);
  const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
  const int32x4_t v_tmp_lo =
      vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
  const int32x4_t v_tmp_hi =
      vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
  // Quantized magnitude: (tmp * quant) >> 16.
  const int16x8_t v_tmp2 =
      vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
  // Set for lanes that quantized to zero.
  const uint16x8_t v_zero_mask = vceqq_s16(v_tmp2, v_zero);
  const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
  // iscan + 1 for non-zero lanes, 0 for lanes that quantized to zero.
  const int16x8_t v_nz_iscan = vbslq_s16(v_zero_mask, v_zero, v_iscan_plus1);
  // Restore the sign: (x ^ sign) - sign negates x where sign is all ones.
  const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
  const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);

  calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr);
  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
  return vmaxq_s16(v_eobmax, v_nz_iscan);
}

// NEON implementation of the VP9 "fp" quantizer.
//
// coeff_ptr    input coefficients.
// count        number of coefficients. They are processed in groups of
//              eight; the first group is always processed, further groups
//              while their start index is < count.
// skip_block   must be zero (asserted); otherwise unused.
// round_ptr, quant_ptr, dequant_ptr
//              element [0] is applied to coefficient 0, element [1] to every
//              other coefficient.
// qcoeff_ptr   receives the quantized coefficients.
// dqcoeff_ptr  receives the dequantized coefficients.
// eob_ptr      receives max(iscan[i] + 1) over all i whose quantized value
//              is non-zero, or 0 when every coefficient quantizes to zero.
// scan         unused.
// iscan        per-coefficient position used to derive *eob_ptr.
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          int skip_block, const int16_t *round_ptr,
                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan,
                          const int16_t *iscan) {
  int i;
  // Parameters for every group after the first: index 1 in all lanes.
  const int16x8_t v_round_rest = vmovq_n_s16(round_ptr[1]);
  const int16x8_t v_quant_rest = vmovq_n_s16(quant_ptr[1]);
  const int16x8_t v_dequant_rest = vmovq_n_s16(dequant_ptr[1]);
  // Parameters for the first group: same, but lane 0 uses index 0.
  const int16x8_t v_round_first =
      vsetq_lane_s16(round_ptr[0], v_round_rest, 0);
  const int16x8_t v_quant_first =
      vsetq_lane_s16(quant_ptr[0], v_quant_rest, 0);
  const int16x8_t v_dequant_first =
      vsetq_lane_s16(dequant_ptr[0], v_dequant_rest, 0);
  // Running per-lane maximum of (iscan + 1) over non-zero coefficients.
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);

  (void)scan;
  (void)skip_block;
  assert(!skip_block);

  // Process coefficient 0 and the following seven coefficients.
  v_eobmax_76543210 =
      quantize_fp_8(coeff_ptr, iscan, v_round_first, v_quant_first,
                    v_dequant_first, qcoeff_ptr, dqcoeff_ptr,
                    v_eobmax_76543210);

  // Process the remaining coefficients, eight at a time.
  for (i = 8; i < count; i += 8) {
    v_eobmax_76543210 =
        quantize_fp_8(coeff_ptr + i, iscan + i, v_round_rest, v_quant_rest,
                      v_dequant_rest, qcoeff_ptr + i, dqcoeff_ptr + i,
                      v_eobmax_76543210);
  }

  // Horizontal max across the eight lanes gives the end-of-block value.
#ifdef __aarch64__
  *eob_ptr = (uint16_t)vmaxvq_s16(v_eobmax_76543210);
#else
  {
    const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
                                             vget_high_s16(v_eobmax_76543210));
    // Two pairwise-max steps leave the overall maximum in every lane.
    const int16x4_t v_eobmax_pair = vpmax_s16(v_eobmax_3210, v_eobmax_3210);
    const int16x4_t v_eobmax_final = vpmax_s16(v_eobmax_pair, v_eobmax_pair);

    *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
  }
#endif  // __aarch64__
}
135
// Returns a vector whose 32-bit lanes are 1 where the matching lane of a is
// negative and 0 elsewhere. The lanes are reinterpreted as unsigned so the
// right shift by 31 is logical and leaves only the sign bit in bit 0.
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
  const uint32x4_t raw_bits = vreinterpretq_u32_s32(a);
  const uint32x4_t sign_bits = vshrq_n_u32(raw_bits, 31);
  return vreinterpretq_s32_u32(sign_bits);
}
139
// Quantizes and dequantizes one group of eight coefficients on behalf of
// vp9_quantize_fp_32x32_neon().
//
// Per lane:
//   tmp     = saturating_add(|coeff|, v_round)
//   q       = (tmp * v_quant) >> 15
//   qcoeff  = q with the sign of coeff restored, forced to 0 when
//             |coeff| < v_dequant_thresh
//   dqcoeff = (qcoeff * v_dequant) / 2, rounded towards zero
//
// coeff_ptr / iscan_ptr point at the eight inputs of this group;
// qcoeff_ptr / dqcoeff_ptr point at the eight outputs of this group.
//
// Returns, per lane, iscan + 1 where qcoeff is non-zero and 0 elsewhere.
static INLINE uint16x8_t quantize_fp_32x32_8(
    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
    const int16x8_t v_round, const int16x8_t v_quant,
    const int16x8_t v_dequant, const int16x8_t v_dequant_thresh,
    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
  const uint16x8_t v_iscan_plus1 = vreinterpretq_u16_s16(
      vaddq_s16(vld1q_s16(iscan_ptr), vdupq_n_s16(1)));
  const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
  // All ones for negative lanes, all zeros otherwise.
  const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  const int16x8_t v_coeff_abs = vabsq_s16(v_coeff);
  // All ones for lanes whose magnitude reaches the threshold.
  const int16x8_t v_keep_mask =
      vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, v_dequant_thresh));
  int16x8_t v_qcoeff = vqaddq_s16(v_coeff_abs, v_round);
  int32x4_t v_dqcoeff_0, v_dqcoeff_1;

  // (tmp * quant) >> 15, via the doubling multiply that returns the high
  // half.
  v_qcoeff = vqdmulhq_s16(v_qcoeff, v_quant);

  // Restore sign, then zero the lanes that are below the threshold.
  v_qcoeff = veorq_s16(v_qcoeff, v_coeff_sign);
  v_qcoeff = vsubq_s16(v_qcoeff, v_coeff_sign);
  v_qcoeff = vandq_s16(v_qcoeff, v_keep_mask);

  // qcoeff * dequant / 2
  v_dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(v_dequant));
  v_dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(v_dequant));

  // Add 1 if negative to round towards zero because the C uses division.
  v_dqcoeff_0 = vaddq_s32(v_dqcoeff_0, extract_sign_bit(v_dqcoeff_0));
  v_dqcoeff_1 = vaddq_s32(v_dqcoeff_1, extract_sign_bit(v_dqcoeff_1));

#if CONFIG_VP9_HIGHBITDEPTH
  vst1q_s32(dqcoeff_ptr, vshrq_n_s32(v_dqcoeff_0, 1));
  vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(v_dqcoeff_1, 1));
#else
  store_s16q_to_tran_low(dqcoeff_ptr,
                         vcombine_s16(vshrn_n_s32(v_dqcoeff_0, 1),
                                      vshrn_n_s32(v_dqcoeff_1, 1)));
#endif  // CONFIG_VP9_HIGHBITDEPTH

  store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);

  // vtst against all-ones yields a mask of the non-zero lanes.
  return vandq_u16(vtstq_s16(v_qcoeff, vdupq_n_s16(-1)), v_iscan_plus1);
}

// NEON implementation of the VP9 "fp" quantizer for 32x32 blocks.
//
// coeff_ptr    input coefficients; exactly 32 * 32 are processed.
// count        unused; the coefficient count is fixed at 32 * 32.
// skip_block   must be zero (asserted); otherwise unused.
// round_ptr, quant_ptr, dequant_ptr
//              the first eight coefficients use vectors loaded from the
//              start of these arrays (so eight int16_t values are read from
//              each); all later coefficients use element [1].
//              round values are halved with rounding before use and
//              dequant >> 2 serves as a zeroing threshold.
// qcoeff_ptr   receives the quantized coefficients.
// dqcoeff_ptr  receives the dequantized coefficients.
// eob_ptr      receives max(iscan[i] + 1) over all i whose quantized value
//              is non-zero, or 0 when every coefficient quantizes to zero.
// scan         unused.
// iscan        per-coefficient position used to derive *eob_ptr.
void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
                                int skip_block, const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  // Parameters for the first group of eight, loaded from memory.
  // ROUND_POWER_OF_TWO(round_ptr[], 1)
  const int16x8_t v_round_first = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
  const int16x8_t v_quant_first = vld1q_s16(quant_ptr);
  // Lanes 0..3 come from memory, lanes 4..7 are dequant_ptr[1].
  const int16x8_t v_dequant_first =
      vcombine_s16(vld1_s16(dequant_ptr), vdup_n_s16(dequant_ptr[1]));
  // dequant >> 2 is used similar to zbin as a threshold.
  const int16x8_t v_thresh_first = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);

  // Parameters for all remaining groups: element [1] in every lane.
  const int16x8_t v_round_rest =
      vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1);
  const int16x8_t v_quant_rest = vmovq_n_s16(quant_ptr[1]);
  const int16x8_t v_dequant_rest = vmovq_n_s16(dequant_ptr[1]);
  const int16x8_t v_thresh_rest = vshrq_n_s16(v_dequant_rest, 2);

  uint16x8_t v_eob_max;
  int i;

  (void)scan;
  (void)count;
  (void)skip_block;
  assert(!skip_block);

  // Process coefficient 0 and the following seven coefficients.
  v_eob_max = quantize_fp_32x32_8(coeff_ptr, iscan, v_round_first,
                                  v_quant_first, v_dequant_first,
                                  v_thresh_first, qcoeff_ptr, dqcoeff_ptr);

  // Process the remaining coefficients, eight at a time.
  for (i = 8; i < 32 * 32; i += 8) {
    const uint16x8_t v_eob = quantize_fp_32x32_8(
        coeff_ptr + i, iscan + i, v_round_rest, v_quant_rest, v_dequant_rest,
        v_thresh_rest, qcoeff_ptr + i, dqcoeff_ptr + i);
    v_eob_max = vmaxq_u16(v_eob_max, v_eob);
  }

  // Horizontal max across the eight lanes gives the end-of-block value.
#ifdef __aarch64__
  *eob_ptr = vmaxvq_u16(v_eob_max);
#else
  {
    const uint16x4_t v_eob_max_0 =
        vmax_u16(vget_low_u16(v_eob_max), vget_high_u16(v_eob_max));
    const uint16x4_t v_eob_max_1 = vpmax_u16(v_eob_max_0, v_eob_max_0);
    const uint16x4_t v_eob_max_2 = vpmax_u16(v_eob_max_1, v_eob_max_1);
    vst1_lane_u16(eob_ptr, v_eob_max_2, 0);
  }
#endif  // __aarch64__
}
269