• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <emmintrin.h>
12 #include <xmmintrin.h>
13 
14 #include "./vp9_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
18 
// Quantizes and dequantizes a block of transform coefficients with SSE2,
// 16 coefficients per step, and reports the end-of-block position.
//
// For each coefficient c the quantized value is
//   sign(c) * (((|c| + round) * quant) >> 16)
// using a saturating 16-bit add and the high half of a signed 16x16 multiply.
// The dequantized value is the low 16 bits of (quantized * dequant).
//
//   coeff_ptr    input coefficients (read through load_tran_low()).
//   n_coeffs     number of coefficients. The code always works on whole groups
//                of 16 and processes the first group unconditionally, so this
//                is presumably a positive multiple of 16 -- confirm with
//                callers.
//   skip_block   when non-zero, qcoeff/dqcoeff are zero-filled and *eob_ptr is
//                set to 0; no quantization is done.
//   round_ptr, quant_ptr, dequant_ptr
//                8 x int16_t tables read with aligned 128-bit loads. The
//                vectors as loaded are applied to the first 8 coefficients;
//                their upper four lanes are then duplicated across the whole
//                vector and used for every remaining coefficient.
//   qcoeff_ptr   output: quantized coefficients.
//   dqcoeff_ptr  output: dequantized coefficients.
//   eob_ptr      output: largest (iscan + 1) over all positions whose
//                dequantized value is non-zero, or 0 if there is none.
//   scan_ptr     unused.
//   iscan_ptr    per-position scan indices, read with aligned 128-bit loads.
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          int skip_block, const int16_t *round_ptr,
                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan_ptr,
                          const int16_t *iscan_ptr) {
  __m128i zero;
  __m128i thr;
  // Holds the OR of two _mm_movemask_epi8() results. Each is an int with up
  // to 16 significant bits, so a 16-bit signed type cannot represent every
  // value; keep it as a plain int.
  int nzflag;

  (void)scan_ptr;

  // Point every array at its end and count n_coeffs from -N up to 0, so one
  // negative offset indexes all of them and the loop test is a sign check.
  coeff_ptr += n_coeffs;
  iscan_ptr += n_coeffs;
  qcoeff_ptr += n_coeffs;
  dqcoeff_ptr += n_coeffs;
  n_coeffs = -n_coeffs;
  zero = _mm_setzero_si128();

  if (!skip_block) {
    __m128i eob;
    __m128i round, quant, dequant;
    {
      __m128i coeff0, coeff1;

      // Setup global values
      {
        round = _mm_load_si128((const __m128i *)round_ptr);
        quant = _mm_load_si128((const __m128i *)quant_ptr);
        dequant = _mm_load_si128((const __m128i *)dequant_ptr);
      }

      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;
        // Do DC and first 15 AC
        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract: the arithmetic shift yields 0 or -1 per
        // lane, and (x ^ s) - s is then the absolute value of x.
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // After its first use each table has its upper 64 bits copied into
        // both halves; that form is used for all remaining coefficients.
        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
        round = _mm_unpackhi_epi64(round, round);
        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
        quant = _mm_unpackhi_epi64(quant, quant);
        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

        // Reinsert signs
        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

        // coeff0/coeff1 are reused to hold the dequantized values, which the
        // end-of-block scan below inspects.
        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
        dequant = _mm_unpackhi_epi64(dequant, dequant);
        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
      }

      {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts. The non-zero mask is -1
        // in the lanes of interest, so subtracting it adds one there.
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob = _mm_max_epi16(eob, eob1);
      }
      n_coeffs += 8 * 2;
    }

    // Half of the (already broadcast) dequant value: groups in which no
    // magnitude exceeds it are written out as zeros without being quantized.
    thr = _mm_srai_epi16(dequant, 1);

    // AC only loop
    while (n_coeffs < 0) {
      __m128i coeff0, coeff1;
      {
        __m128i coeff0_sign, coeff1_sign;
        __m128i qcoeff0, qcoeff1;
        __m128i qtmp0, qtmp1;

        coeff0 = load_tran_low(coeff_ptr + n_coeffs);
        coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);

        // Poor man's sign extract
        coeff0_sign = _mm_srai_epi16(coeff0, 15);
        coeff1_sign = _mm_srai_epi16(coeff1, 15);
        qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
        qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

        // Non-zero when at least one of the 16 magnitudes is above thr.
        nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
                 _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));

        if (nzflag) {
          qcoeff0 = _mm_adds_epi16(qcoeff0, round);
          qcoeff1 = _mm_adds_epi16(qcoeff1, round);
          qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
          qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);

          // Reinsert signs
          qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
          qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
          qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
          qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);

          store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
          store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);

          coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
          coeff1 = _mm_mullo_epi16(qcoeff1, dequant);

          store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
          store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
        } else {
          store_zero_tran_low(qcoeff_ptr + n_coeffs);
          store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

          store_zero_tran_low(dqcoeff_ptr + n_coeffs);
          store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
        }
      }

      // An all-zero group cannot raise eob, so the scan is skipped for it.
      if (nzflag) {
        // Scan for eob
        __m128i zero_coeff0, zero_coeff1;
        __m128i nzero_coeff0, nzero_coeff1;
        __m128i iscan0, iscan1;
        __m128i eob0, eob1;
        zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
        zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
        nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
        nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
        iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs));
        iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1);
        // Add one to convert from indices to counts
        iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
        iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
        eob0 = _mm_and_si128(iscan0, nzero_coeff0);
        eob1 = _mm_and_si128(iscan1, nzero_coeff1);
        eob0 = _mm_max_epi16(eob0, eob1);
        eob = _mm_max_epi16(eob, eob0);
      }
      n_coeffs += 8 * 2;
    }

    // Accumulate EOB: horizontal max over the eight 16-bit lanes. Each step
    // folds the vector onto itself (8 -> 4 -> 2 -> 1 candidates); after the
    // last one lane 1 holds the overall maximum.
    {
      __m128i eob_shuffled;
      eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
      eob = _mm_max_epi16(eob, eob_shuffled);
      eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
      eob = _mm_max_epi16(eob, eob_shuffled);
      *eob_ptr = _mm_extract_epi16(eob, 1);
    }
  } else {
    // Skipped block: emit all-zero outputs. The do/while writes at least one
    // group of 16 before testing the counter.
    do {
      store_zero_tran_low(qcoeff_ptr + n_coeffs);
      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);

      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
      n_coeffs += 8 * 2;
    } while (n_coeffs < 0);
    *eob_ptr = 0;
  }
}
212