1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <assert.h>
13
14 #include "./vpx_config.h"
15 #include "./vpx_dsp_rtcd.h"
16 #include "vpx_dsp/arm/mem_neon.h"
17
// Writes the dequantized values for eight coefficients: every lane of |qcoeff|
// multiplied by the matching lane of |dequant|.
static INLINE void quantize_b_store_dqcoeff_neon(const int16x8_t qcoeff,
                                                 const int16x8_t dequant,
                                                 tran_low_t *dqcoeff_ptr) {
#if CONFIG_VP9_HIGHBITDEPTH
  // Keep the full 32-bit products. A 16-bit multiply wraps whenever
  // qcoeff * dequant does not fit in an int16_t.
  // NOTE(review): assumes tran_low_t is int32_t when CONFIG_VP9_HIGHBITDEPTH
  // is set - confirm against its typedef.
  vst1q_s32(dqcoeff_ptr,
            vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)));
  vst1q_s32(dqcoeff_ptr + 4,
            vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)));
#else
  store_s16q_to_tran_low(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

// Quantizes the eight coefficients at |coeff_ptr| and stores the results to
// |qcoeff_ptr| and |dqcoeff_ptr|. Returns a vector holding iscan + 1 in every
// lane whose quantized value is non-zero and 0 in every other lane.
static INLINE uint16x8_t quantize_b_8_neon(
    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
    const int16x8_t round, const int16x8_t quant, const int16x8_t quant_shift,
    const int16x8_t dequant) {
  // Add one because the eob does not index from 0.
  const uint16x8_t iscan = vreinterpretq_u16_s16(
      vaddq_s16(vld1q_s16(iscan_ptr), vdupq_n_s16(1)));

  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
  // All ones for negative lanes, all zeros otherwise.
  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
  const int16x8_t coeff_abs = vabsq_s16(coeff);

  // Lanes whose magnitude is below zbin are forced to zero further down.
  const int16x8_t zbin_mask =
      vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));

  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);

  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);

  qcoeff = vaddq_s16(qcoeff, rounded);

  // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
  qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);

  // Restore the sign: (x ^ sign) - sign negates x where sign is all ones.
  qcoeff = veorq_s16(qcoeff, coeff_sign);
  qcoeff = vsubq_s16(qcoeff, coeff_sign);

  qcoeff = vandq_s16(qcoeff, zbin_mask);

  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
  quantize_b_store_dqcoeff_neon(qcoeff, dequant, dqcoeff_ptr);

  // Set non-zero elements to -1 and use that to extract values for eob.
  return vandq_u16(vtstq_s16(qcoeff, vdupq_n_s16(-1)), iscan);
}

// Quantizes |n_coeffs| coefficients, eight at a time.
//
// coeff_ptr:        input coefficients.
// n_coeffs:         number of coefficients. The first eight are always
//                   processed; the rest are consumed in groups of eight.
// skip_block:       must be 0 (asserted); otherwise unused.
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr:
//                   eight values are loaded from each table for the first
//                   group; element [1] is broadcast for all later groups.
// qcoeff_ptr:       output, quantized coefficients.
// dqcoeff_ptr:      output, qcoeff * dequant.
// eob_ptr:          output, the largest iscan value + 1 over all non-zero
//                   quantized coefficients, or 0 if there are none.
// scan_ptr:         unused.
// iscan_ptr:        per-coefficient values used to derive the eob.
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block, const int16_t *zbin_ptr,
                         const int16_t *round_ptr, const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan_ptr,
                         const int16_t *iscan_ptr) {
  uint16x8_t eob_max;
  (void)scan_ptr;
  (void)skip_block;
  assert(!skip_block);

  // Process first 8 values which include a dc component. Only the first
  // element of each table vector is DC.
  eob_max = quantize_b_8_neon(
      coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr, vld1q_s16(zbin_ptr),
      vld1q_s16(round_ptr), vld1q_s16(quant_ptr), vld1q_s16(quant_shift_ptr),
      vld1q_s16(dequant_ptr));

  coeff_ptr += 8;
  iscan_ptr += 8;
  qcoeff_ptr += 8;
  dqcoeff_ptr += 8;
  n_coeffs -= 8;

  {
    // Every remaining coefficient uses the second table entry.
    const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
    const int16x8_t round = vdupq_n_s16(round_ptr[1]);
    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);

    // Checked before the first iteration so that nothing beyond the first
    // group is touched when n_coeffs is 8.
    while (n_coeffs > 0) {
      eob_max = vmaxq_u16(
          eob_max, quantize_b_8_neon(coeff_ptr, iscan_ptr, qcoeff_ptr,
                                     dqcoeff_ptr, zbin, round, quant,
                                     quant_shift, dequant));

      coeff_ptr += 8;
      iscan_ptr += 8;
      qcoeff_ptr += 8;
      dqcoeff_ptr += 8;
      n_coeffs -= 8;
    }
  }

  {
    // Reduce the eight lanes to a single maximum, which ends up in lane 0.
    const uint16x4_t eob_max_0 =
        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
    vst1_lane_u16(eob_ptr, eob_max_2, 0);
  }
}
146
// Returns a vector whose lanes are 1 where the matching lane of |a| is
// negative and 0 where it is not.
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
  // A logical (unsigned) shift moves the sign bit down to bit 0 and fills the
  // remaining bits with zeros.
  const uint32x4_t raw_bits = vreinterpretq_u32_s32(a);
  const uint32x4_t sign_bits = vshrq_n_u32(raw_bits, 31);
  return vreinterpretq_s32_u32(sign_bits);
}
150
// Writes the dequantized values for eight coefficients of a 32x32 block:
// (qcoeff * dequant) / 2, rounded towards zero.
static INLINE void quantize_b_32x32_store_dqcoeff_neon(
    const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff_ptr) {
  int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
  int32x4_t dqcoeff_1 =
      vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));

  // Add 1 if negative to round towards zero because the C uses division.
  dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
  dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));

#if CONFIG_VP9_HIGHBITDEPTH
  // Keep all 32 bits of the halved product. Narrowing to 16 bits wraps
  // whenever the result does not fit in an int16_t.
  // NOTE(review): assumes tran_low_t is int32_t when CONFIG_VP9_HIGHBITDEPTH
  // is set - confirm against its typedef.
  vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
  vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
#else
  store_s16q_to_tran_low(
      dqcoeff_ptr,
      vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

// Quantizes the eight coefficients at |coeff_ptr| using the 32x32 rules and
// stores the results to |qcoeff_ptr| and |dqcoeff_ptr|. |zbin| and |round| must
// already be halved by the caller. Returns a vector holding iscan + 1 in every
// lane whose quantized value is non-zero and 0 in every other lane.
static INLINE uint16x8_t quantize_b_32x32_8_neon(
    const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
    const int16x8_t round, const int16x8_t quant, const int16x8_t quant_shift,
    const int16x8_t dequant) {
  // Add one because the eob does not index from 0.
  const uint16x8_t iscan = vreinterpretq_u16_s16(
      vaddq_s16(vld1q_s16(iscan_ptr), vdupq_n_s16(1)));

  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
  // All ones for negative lanes, all zeros otherwise.
  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
  const int16x8_t coeff_abs = vabsq_s16(coeff);

  // Lanes whose magnitude is below zbin are forced to zero further down.
  const int16x8_t zbin_mask =
      vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));

  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);

  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);

  qcoeff = vaddq_s16(qcoeff, rounded);

  // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
  qcoeff = vqdmulhq_s16(qcoeff, quant_shift);

  // Restore the sign: (x ^ sign) - sign negates x where sign is all ones.
  qcoeff = veorq_s16(qcoeff, coeff_sign);
  qcoeff = vsubq_s16(qcoeff, coeff_sign);

  qcoeff = vandq_s16(qcoeff, zbin_mask);

  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
  quantize_b_32x32_store_dqcoeff_neon(qcoeff, dequant, dqcoeff_ptr);

  // Set non-zero elements to -1 and use that to extract values for eob.
  return vandq_u16(vtstq_s16(qcoeff, vdupq_n_s16(-1)), iscan);
}

// Quantizes a 32x32 block (always 32 * 32 coefficients, eight at a time).
//
// Main difference from vpx_quantize_b_neon is that zbin and round values are
// halved (with rounding) before use, the quant_shift product is shifted by 15
// rather than 16, and dqcoeff values are divided by 2, truncating towards zero.
//
// coeff_ptr:        input coefficients.
// n_coeffs:         unused; 32 * 32 coefficients are always processed.
// skip_block:       must be 0 (asserted); otherwise unused.
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr:
//                   eight values are loaded from each table for the first
//                   group; element [1] is broadcast for all later groups.
// qcoeff_ptr:       output, quantized coefficients.
// dqcoeff_ptr:      output, (qcoeff * dequant) / 2.
// eob_ptr:          output, the largest iscan value + 1 over all non-zero
//                   quantized coefficients, or 0 if there are none.
// scan_ptr:         unused.
// iscan_ptr:        per-coefficient values used to derive the eob.
void vpx_quantize_b_32x32_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  uint16x8_t eob_max;
  int i;
  (void)scan_ptr;
  (void)n_coeffs;  // Because we will always calculate 32*32.
  (void)skip_block;
  assert(!skip_block);

  // Process first 8 values which include a dc component. Only the first
  // element of each table vector is DC.
  eob_max = quantize_b_32x32_8_neon(
      coeff_ptr, iscan_ptr, qcoeff_ptr, dqcoeff_ptr,
      vrshrq_n_s16(vld1q_s16(zbin_ptr), 1),
      vrshrq_n_s16(vld1q_s16(round_ptr), 1), vld1q_s16(quant_ptr),
      vld1q_s16(quant_shift_ptr), vld1q_s16(dequant_ptr));

  coeff_ptr += 8;
  iscan_ptr += 8;
  qcoeff_ptr += 8;
  dqcoeff_ptr += 8;

  {
    // Every remaining coefficient uses the second table entry.
    const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
    const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);

    // Group 0 was handled above; process the other 32 * 32 / 8 - 1 groups.
    for (i = 1; i < 32 * 32 / 8; ++i) {
      eob_max = vmaxq_u16(
          eob_max, quantize_b_32x32_8_neon(coeff_ptr, iscan_ptr, qcoeff_ptr,
                                           dqcoeff_ptr, zbin, round, quant,
                                           quant_shift, dequant));

      coeff_ptr += 8;
      iscan_ptr += 8;
      qcoeff_ptr += 8;
      dqcoeff_ptr += 8;
    }
  }

  {
    // Reduce the eight lanes to a single maximum, which ends up in lane 0.
    const uint16x4_t eob_max_0 =
        vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
    const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
    const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
    vst1_lane_u16(eob_ptr, eob_max_2, 0);
  }
}
297