• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 #include <assert.h>
13 #include <math.h>
14 
15 #include "./vpx_config.h"
16 #include "vpx_mem/vpx_mem.h"
17 
18 #include "vp9/common/vp9_quant_common.h"
19 #include "vp9/common/vp9_seg_common.h"
20 
21 #include "vp9/encoder/vp9_encoder.h"
22 #include "vp9/encoder/vp9_quantize.h"
23 #include "vp9/encoder/vp9_rd.h"
24 
25 #include "vpx_dsp/arm/idct_neon.h"
26 #include "vpx_dsp/arm/mem_neon.h"
27 #include "vpx_dsp/vpx_dsp_common.h"
28 
calculate_dqcoeff_and_store(const int16x8_t qcoeff,const int16x8_t dequant,tran_low_t * dqcoeff)29 static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
30                                                const int16x8_t dequant,
31                                                tran_low_t *dqcoeff) {
32   const int32x4_t dqcoeff_0 =
33       vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
34   const int32x4_t dqcoeff_1 =
35       vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
36 
37 #if CONFIG_VP9_HIGHBITDEPTH
38   vst1q_s32(dqcoeff, dqcoeff_0);
39   vst1q_s32(dqcoeff + 4, dqcoeff_1);
40 #else
41   vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
42 #endif  // CONFIG_VP9_HIGHBITDEPTH
43 }
44 
vp9_quantize_fp_neon(const tran_low_t * coeff_ptr,intptr_t count,int skip_block,const int16_t * round_ptr,const int16_t * quant_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16_t * dequant_ptr,uint16_t * eob_ptr,const int16_t * scan,const int16_t * iscan)45 void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
46                           int skip_block, const int16_t *round_ptr,
47                           const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
48                           tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
49                           uint16_t *eob_ptr, const int16_t *scan,
50                           const int16_t *iscan) {
51   // Quantization pass: All coefficients with index >= zero_flag are
52   // skippable. Note: zero_flag can be zero.
53   int i;
54   const int16x8_t v_zero = vdupq_n_s16(0);
55   const int16x8_t v_one = vdupq_n_s16(1);
56   int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
57   int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
58   int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
59   int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
60 
61   (void)scan;
62   (void)skip_block;
63   assert(!skip_block);
64 
65   // adjust for dc
66   v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
67   v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
68   v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
69   // process dc and the first seven ac coeffs
70   {
71     const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
72     const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
73     const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
74     const int16x8_t v_abs = vabsq_s16(v_coeff);
75     const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
76     const int32x4_t v_tmp_lo =
77         vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
78     const int32x4_t v_tmp_hi =
79         vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
80     const int16x8_t v_tmp2 =
81         vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
82     const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
83     const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
84     const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
85     const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
86     const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
87     calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr);
88     v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
89     store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
90     v_round = vmovq_n_s16(round_ptr[1]);
91     v_quant = vmovq_n_s16(quant_ptr[1]);
92     v_dequant = vmovq_n_s16(dequant_ptr[1]);
93   }
94   // now process the rest of the ac coeffs
95   for (i = 8; i < count; i += 8) {
96     const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
97     const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
98     const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
99     const int16x8_t v_abs = vabsq_s16(v_coeff);
100     const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
101     const int32x4_t v_tmp_lo =
102         vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
103     const int32x4_t v_tmp_hi =
104         vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
105     const int16x8_t v_tmp2 =
106         vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
107     const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
108     const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
109     const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
110     const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
111     const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
112     calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i);
113     v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
114     store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
115   }
116 #ifdef __aarch64__
117   *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
118 #else
119   {
120     const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
121                                              vget_high_s16(v_eobmax_76543210));
122     const int64x1_t v_eobmax_xx32 =
123         vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
124     const int16x4_t v_eobmax_tmp =
125         vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
126     const int64x1_t v_eobmax_xxx3 =
127         vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
128     const int16x4_t v_eobmax_final =
129         vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
130 
131     *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
132   }
133 #endif  // __aarch64__
134 }
135 
extract_sign_bit(int32x4_t a)136 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
137   return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
138 }
139 
vp9_quantize_fp_32x32_neon(const tran_low_t * coeff_ptr,intptr_t count,int skip_block,const int16_t * round_ptr,const int16_t * quant_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16_t * dequant_ptr,uint16_t * eob_ptr,const int16_t * scan,const int16_t * iscan)140 void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
141                                 int skip_block, const int16_t *round_ptr,
142                                 const int16_t *quant_ptr,
143                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
144                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
145                                 const int16_t *scan, const int16_t *iscan) {
146   const int16x8_t one = vdupq_n_s16(1);
147   const int16x8_t neg_one = vdupq_n_s16(-1);
148 
149   // ROUND_POWER_OF_TWO(round_ptr[], 1)
150   const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
151   const int16x8_t quant = vld1q_s16(quant_ptr);
152   const int16x4_t dequant = vld1_s16(dequant_ptr);
153   // dequant >> 2 is used similar to zbin as a threshold.
154   const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
155 
156   // Process dc and the first seven ac coeffs.
157   const uint16x8_t v_iscan =
158       vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
159   const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
160   const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
161   const int16x8_t coeff_abs = vabsq_s16(coeff);
162   const int16x8_t dequant_mask =
163       vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
164 
165   int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
166   int32x4_t dqcoeff_0, dqcoeff_1;
167   uint16x8_t eob_max;
168   (void)scan;
169   (void)count;
170   (void)skip_block;
171   assert(!skip_block);
172 
173   // coeff * quant_ptr[]) >> 15
174   qcoeff = vqdmulhq_s16(qcoeff, quant);
175 
176   // Restore sign.
177   qcoeff = veorq_s16(qcoeff, coeff_sign);
178   qcoeff = vsubq_s16(qcoeff, coeff_sign);
179   qcoeff = vandq_s16(qcoeff, dequant_mask);
180 
181   // qcoeff * dequant[] / 2
182   dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant);
183   dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
184 
185   // Add 1 if negative to round towards zero because the C uses division.
186   dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
187   dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
188 #if CONFIG_VP9_HIGHBITDEPTH
189   vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
190   vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
191 #else
192   store_s16q_to_tran_low(dqcoeff_ptr, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1),
193                                                    vshrn_n_s32(dqcoeff_1, 1)));
194 #endif
195 
196   eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
197 
198   store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
199 
200   iscan += 8;
201   coeff_ptr += 8;
202   qcoeff_ptr += 8;
203   dqcoeff_ptr += 8;
204 
205   {
206     int i;
207     const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1);
208     const int16x8_t quant = vmovq_n_s16(quant_ptr[1]);
209     const int16x8_t dequant_thresh =
210         vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2);
211 
212     // Process the rest of the ac coeffs.
213     for (i = 8; i < 32 * 32; i += 8) {
214       const uint16x8_t v_iscan =
215           vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
216       const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
217       const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
218       const int16x8_t coeff_abs = vabsq_s16(coeff);
219       const int16x8_t dequant_mask =
220           vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
221 
222       int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
223       int32x4_t dqcoeff_0, dqcoeff_1;
224 
225       qcoeff = vqdmulhq_s16(qcoeff, quant);
226       qcoeff = veorq_s16(qcoeff, coeff_sign);
227       qcoeff = vsubq_s16(qcoeff, coeff_sign);
228       qcoeff = vandq_s16(qcoeff, dequant_mask);
229 
230       dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]);
231       dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
232 
233       dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
234       dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
235 
236 #if CONFIG_VP9_HIGHBITDEPTH
237       vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
238       vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
239 #else
240       store_s16q_to_tran_low(
241           dqcoeff_ptr,
242           vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
243 #endif
244 
245       eob_max =
246           vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
247 
248       store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
249 
250       iscan += 8;
251       coeff_ptr += 8;
252       qcoeff_ptr += 8;
253       dqcoeff_ptr += 8;
254     }
255 
256 #ifdef __aarch64__
257     *eob_ptr = vmaxvq_u16(eob_max);
258 #else
259     {
260       const uint16x4_t eob_max_0 =
261           vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
262       const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
263       const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
264       vst1_lane_u16(eob_ptr, eob_max_2, 0);
265     }
266 #endif  // __aarch64__
267   }
268 }
269