• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/utils/entropy_decoder.h"
16 
17 #include <cassert>
18 #include <cstring>
19 
20 #include "src/utils/common.h"
21 #include "src/utils/compiler_attributes.h"
22 #include "src/utils/constants.h"
23 #include "src/utils/cpu.h"
24 
25 #if defined(__ARM_NEON__) || defined(__aarch64__) || \
26     (defined(_MSC_VER) && defined(_M_ARM))
27 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
28 #else
29 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
30 #endif
31 
32 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
33 #include <arm_neon.h>
34 #endif
35 
36 #if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
37 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
38 #else
39 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
40 #endif
41 
42 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
43 #include <emmintrin.h>
44 #endif
45 
46 namespace libgav1 {
47 namespace {
48 
49 constexpr uint32_t kReadBitMask = ~255;
50 constexpr int kCdfPrecision = 6;
51 constexpr int kMinimumProbabilityPerSymbol = 4;
52 
53 // This function computes the "cur" variable as specified inside the do-while
54 // loop in Section 8.2.6 of the spec. This function is monotonically
55 // decreasing as the values of index increases (note that the |cdf| array is
56 // sorted in decreasing order).
ScaleCdf(uint32_t values_in_range_shifted,const uint16_t * const cdf,int index,int symbol_count)57 uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
58                   int index, int symbol_count) {
59   return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
60          (kMinimumProbabilityPerSymbol * (symbol_count - index));
61 }
62 
UpdateCdf(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol_count,const int symbol)63 void UpdateCdf(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol_count,
64                const int symbol) {
65   const uint16_t count = cdf[symbol_count];
66   // rate is computed in the spec as:
67   //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
68   // In this case cdf[N] is |count|.
69   // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
70   // symbol_count > 3. So the equation becomes:
71   //  4 + (count > 15) + (count > 31) + (symbol_count > 3).
72   // Note that the largest value for count is 32 (it is not incremented beyond
73   // 32). So using that information:
74   //  count >> 4 is 0 for count from 0 to 15.
75   //  count >> 4 is 1 for count from 16 to 31.
76   //  count >> 4 is 2 for count == 31.
77   // Now, the equation becomes:
78   //  4 + (count >> 4) + (symbol_count > 3).
79   // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
80   // with bitwise or:
81   //  (4 | (count >> 4)) + (symbol_count > 3).
82   // but using addition will allow the compiler to eliminate an operation when
83   // symbol_count is known and this function is inlined.
84   const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
85   // Hints for further optimizations:
86   //
87   // 1. clang can vectorize this for loop with width 4, even though the loop
88   // contains an if-else statement. Therefore, it may be advantageous to use
89   // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
90   // (a multiple of 4 that's not too small).
91   //
92   // 2. The for loop can be rewritten in the following form, which would enable
93   // clang to vectorize the loop with width 8:
94   //
95   //   const int rounding = (1 << rate) - 1;
96   //   for (int i = 0; i < symbol_count - 1; ++i) {
97   //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
98   //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
99   //   }
100   //
101   // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
102   // integer arithmetic. The result of the unsigned subtraction is cast to a
103   // signed integer and right-shifted. This requires the right shift of a
104   // signed integer be an arithmetic shift, which is true for clang, gcc, and
105   // Visual C++.
106   assert(symbol_count - 1 > 0);
107   int i = 0;
108   do {
109     if (i < symbol) {
110       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
111     } else {
112       cdf[i] -= cdf[i] >> rate;
113     }
114   } while (++i < symbol_count - 1);
115   cdf[symbol_count] += static_cast<uint16_t>(count < 32);
116 }
117 
118 // Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
119 // of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
120 // SIMD instruction sets if available.
121 
122 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
123 
124 // The UpdateCdf() method contains the following for loop:
125 //
126 //   for (int i = 0; i < symbol_count - 1; ++i) {
127 //     if (i < symbol) {
128 //       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
129 //     } else {
130 //       cdf[i] -= cdf[i] >> rate;
131 //     }
132 //   }
133 //
134 // It can be rewritten in the following two forms, which are amenable to SIMD
135 // implementations:
136 //
137 //   const int rounding = (1 << rate) - 1;
138 //   for (int i = 0; i < symbol_count - 1; ++i) {
139 //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
140 //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
141 //   }
142 //
143 // or:
144 //
145 //   const int rounding = (1 << rate) - 1;
146 //   for (int i = 0; i < symbol_count - 1; ++i) {
147 //     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
148 //     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
149 //   }
150 //
151 // The following ARM NEON implementations use a modified version of the first
152 // form, using the comparison mask and unsigned rollover to avoid the need to
153 // calculate rounding.
154 //
155 // The cdf array has symbol_count + 1 elements. The first symbol_count elements
156 // are the CDF. The last element is a count that is initialized to 0 and may
157 // grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
158 // cdf[symbol_count - 1] is always 0, the for loop does not update
159 // cdf[symbol_count - 1]. However, it would be correct to have the for loop
160 // update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
161 // for loop would take the else branch when i is symbol_count - 1:
162 //      cdf[i] -= cdf[i] >> rate;
163 // Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
164 // after the update. The ARM NEON implementations take advantage of this in the
165 // following two cases:
166 // 1. When symbol_count is 8 or 16, the vectorized code updates the first
167 //    symbol_count elements in the array.
168 // 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
169 //    the cdf array. Since an invalid CDF value is written into cdf[7], the
170 //    count in cdf[7] needs to be fixed up after the vectorized code.
171 
UpdateCdf5(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)172 void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
173   uint16x4_t cdf_vec = vld1_u16(cdf);
174   const uint16_t count = cdf[5];
175   const int rate = (count >> 4) + 5;
176   const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
177   const uint16x4_t index = vcreate_u16(0x0003000200010000);
178   const uint16x4_t symbol_vec = vdup_n_u16(symbol);
179   const uint16x4_t mask = vcge_u16(index, symbol_vec);
180   // i < symbol: 32768, i >= symbol: 65535.
181   const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
182   // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
183   const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
184   // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
185   const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
186   const int16x4_t negative_rate = vdup_n_s16(-rate);
187   // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
188   const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
189   // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
190   // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
191   cdf_vec = vadd_u16(cdf_offset, delta);
192   vst1_u16(cdf, cdf_vec);
193   cdf[5] = count + static_cast<uint16_t>(count < 32);
194 }
195 
196 // This version works for |symbol_count| = 7, 8, or 9.
197 // See UpdateCdf5 for implementation details.
198 template <int symbol_count>
UpdateCdf7To9(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)199 void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
200   static_assert(symbol_count >= 7 && symbol_count <= 9, "");
201   uint16x8_t cdf_vec = vld1q_u16(cdf);
202   const uint16_t count = cdf[symbol_count];
203   const int rate = (count >> 4) + 5;
204   const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
205   const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
206                                         vcreate_u16(0x0007000600050004));
207   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
208   const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
209   const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
210   const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
211   const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
212   const int16x8_t negative_rate = vdupq_n_s16(-rate);
213   const uint16x8_t delta =
214       vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
215   cdf_vec = vaddq_u16(cdf_offset, delta);
216   vst1q_u16(cdf, cdf_vec);
217   cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
218 }
219 
UpdateCdf7(uint16_t * const cdf,const int symbol)220 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
221   UpdateCdf7To9<7>(cdf, symbol);
222 }
223 
UpdateCdf8(uint16_t * const cdf,const int symbol)224 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
225   UpdateCdf7To9<8>(cdf, symbol);
226 }
227 
UpdateCdf9(uint16_t * const cdf,const int symbol)228 void UpdateCdf9(uint16_t* const cdf, const int symbol) {
229   UpdateCdf7To9<9>(cdf, symbol);
230 }
231 
232 // See UpdateCdf5 for implementation details.
UpdateCdf11(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)233 void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
234   uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
235   const uint16_t count = cdf[11];
236   cdf[11] = count + static_cast<uint16_t>(count < 32);
237   const int rate = (count >> 4) + 5;
238   if (symbol > 1) {
239     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
240     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
241     const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
242     const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
243     const int16x8_t negative_rate = vdupq_n_s16(-rate);
244     const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
245                                           vcreate_u16(0x0009000800070006));
246     const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
247     const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
248     const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
249     const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
250     const uint16x8_t delta =
251         vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
252     cdf_vec = vaddq_u16(cdf_offset, delta);
253     vst1q_u16(cdf + 2, cdf_vec);
254   } else {
255     if (symbol != 0) {
256       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
257       cdf[1] -= cdf[1] >> rate;
258     } else {
259       cdf[0] -= cdf[0] >> rate;
260       cdf[1] -= cdf[1] >> rate;
261     }
262     const int16x8_t negative_rate = vdupq_n_s16(-rate);
263     const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
264     cdf_vec = vsubq_u16(cdf_vec, delta);
265     vst1q_u16(cdf + 2, cdf_vec);
266   }
267 }
268 
269 // See UpdateCdf5 for implementation details.
UpdateCdf13(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)270 void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
271   uint16x8_t cdf_vec0 = vld1q_u16(cdf);
272   uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
273   const uint16_t count = cdf[13];
274   const int rate = (count >> 4) + 5;
275   const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
276   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
277   const int16x8_t negative_rate = vdupq_n_s16(-rate);
278 
279   uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
280                                   vcreate_u16(0x0007000600050004));
281   uint16x8_t mask = vcgeq_u16(index, symbol_vec);
282   uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
283   int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
284   uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
285   uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
286   cdf_vec0 = vaddq_u16(cdf_offset, delta);
287   vst1q_u16(cdf, cdf_vec0);
288 
289   index = vcombine_u16(vcreate_u16(0x0007000600050004),
290                        vcreate_u16(0x000b000a00090008));
291   mask = vcgeq_u16(index, symbol_vec);
292   a = vorrq_u16(mask, cdf_max_probability);
293   diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
294   cdf_offset = vsubq_u16(cdf_vec1, mask);
295   delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
296   cdf_vec1 = vaddq_u16(cdf_offset, delta);
297   vst1q_u16(cdf + 4, cdf_vec1);
298 
299   cdf[13] = count + static_cast<uint16_t>(count < 32);
300 }
301 
302 // See UpdateCdf5 for implementation details.
UpdateCdf16(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)303 void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
304   uint16x8_t cdf_vec = vld1q_u16(cdf);
305   const uint16_t count = cdf[16];
306   const int rate = (count >> 4) + 5;
307   const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
308   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
309   const int16x8_t negative_rate = vdupq_n_s16(-rate);
310 
311   uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
312                                   vcreate_u16(0x0007000600050004));
313   uint16x8_t mask = vcgeq_u16(index, symbol_vec);
314   uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
315   int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
316   uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
317   uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
318   cdf_vec = vaddq_u16(cdf_offset, delta);
319   vst1q_u16(cdf, cdf_vec);
320 
321   cdf_vec = vld1q_u16(cdf + 8);
322   index = vcombine_u16(vcreate_u16(0x000b000a00090008),
323                        vcreate_u16(0x000f000e000d000c));
324   mask = vcgeq_u16(index, symbol_vec);
325   a = vorrq_u16(mask, cdf_max_probability);
326   diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
327   cdf_offset = vsubq_u16(cdf_vec, mask);
328   delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
329   cdf_vec = vaddq_u16(cdf_offset, delta);
330   vst1q_u16(cdf + 8, cdf_vec);
331 
332   cdf[16] = count + static_cast<uint16_t>(count < 32);
333 }
334 
335 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
336 
337 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
338 
LoadLo8(const void * a)339 inline __m128i LoadLo8(const void* a) {
340   return _mm_loadl_epi64(static_cast<const __m128i*>(a));
341 }
342 
LoadUnaligned16(const void * a)343 inline __m128i LoadUnaligned16(const void* a) {
344   return _mm_loadu_si128(static_cast<const __m128i*>(a));
345 }
346 
StoreLo8(void * a,const __m128i v)347 inline void StoreLo8(void* a, const __m128i v) {
348   _mm_storel_epi64(static_cast<__m128i*>(a), v);
349 }
350 
StoreUnaligned16(void * a,const __m128i v)351 inline void StoreUnaligned16(void* a, const __m128i v) {
352   _mm_storeu_si128(static_cast<__m128i*>(a), v);
353 }
354 
UpdateCdf5(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)355 void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
356   __m128i cdf_vec = LoadLo8(cdf);
357   const uint16_t count = cdf[5];
358   const int rate = (count >> 4) + 5;
359   const __m128i cdf_max_probability =
360       _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
361   const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
362   const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
363   // i >= symbol.
364   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
365   // i < symbol: 32768, i >= symbol: 65535.
366   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
367   // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
368   const __m128i diff = _mm_sub_epi16(a, cdf_vec);
369   // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
370   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
371   // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
372   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
373   // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
374   // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
375   cdf_vec = _mm_add_epi16(cdf_offset, delta);
376   StoreLo8(cdf, cdf_vec);
377   cdf[5] = count + static_cast<uint16_t>(count < 32);
378 }
379 
380 // This version works for |symbol_count| = 7, 8, or 9.
381 // See UpdateCdf5 for implementation details.
382 template <int symbol_count>
UpdateCdf7To9(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)383 void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
384   static_assert(symbol_count >= 7 && symbol_count <= 9, "");
385   __m128i cdf_vec = LoadUnaligned16(cdf);
386   const uint16_t count = cdf[symbol_count];
387   const int rate = (count >> 4) + 5;
388   const __m128i cdf_max_probability =
389       _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
390   const __m128i index =
391       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
392   const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
393   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
394   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
395   const __m128i diff = _mm_sub_epi16(a, cdf_vec);
396   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
397   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
398   cdf_vec = _mm_add_epi16(cdf_offset, delta);
399   StoreUnaligned16(cdf, cdf_vec);
400   cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
401 }
402 
UpdateCdf7(uint16_t * const cdf,const int symbol)403 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
404   UpdateCdf7To9<7>(cdf, symbol);
405 }
406 
UpdateCdf8(uint16_t * const cdf,const int symbol)407 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
408   UpdateCdf7To9<8>(cdf, symbol);
409 }
410 
UpdateCdf9(uint16_t * const cdf,const int symbol)411 void UpdateCdf9(uint16_t* const cdf, const int symbol) {
412   UpdateCdf7To9<9>(cdf, symbol);
413 }
414 
415 // See UpdateCdf5 for implementation details.
UpdateCdf11(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)416 void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
417   __m128i cdf_vec = LoadUnaligned16(cdf + 2);
418   const uint16_t count = cdf[11];
419   cdf[11] = count + static_cast<uint16_t>(count < 32);
420   const int rate = (count >> 4) + 5;
421   if (symbol > 1) {
422     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
423     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
424     const __m128i cdf_max_probability =
425         _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
426     const __m128i index =
427         _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
428     const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
429     const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
430     const __m128i a = _mm_or_si128(mask, cdf_max_probability);
431     const __m128i diff = _mm_sub_epi16(a, cdf_vec);
432     const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
433     const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
434     cdf_vec = _mm_add_epi16(cdf_offset, delta);
435     StoreUnaligned16(cdf + 2, cdf_vec);
436   } else {
437     if (symbol != 0) {
438       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
439       cdf[1] -= cdf[1] >> rate;
440     } else {
441       cdf[0] -= cdf[0] >> rate;
442       cdf[1] -= cdf[1] >> rate;
443     }
444     const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
445     cdf_vec = _mm_sub_epi16(cdf_vec, delta);
446     StoreUnaligned16(cdf + 2, cdf_vec);
447   }
448 }
449 
450 // See UpdateCdf5 for implementation details.
UpdateCdf13(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)451 void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
452   __m128i cdf_vec0 = LoadLo8(cdf);
453   __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
454   const uint16_t count = cdf[13];
455   const int rate = (count >> 4) + 5;
456   const __m128i cdf_max_probability =
457       _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
458   const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
459 
460   const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
461   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
462   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
463   const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
464   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
465   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
466   cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
467   StoreLo8(cdf, cdf_vec0);
468 
469   const __m128i index1 =
470       _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
471   const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
472   const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
473   const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
474   const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
475   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
476   cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
477   StoreUnaligned16(cdf + 4, cdf_vec1);
478 
479   cdf[13] = count + static_cast<uint16_t>(count < 32);
480 }
481 
UpdateCdf16(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)482 void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
483   __m128i cdf_vec0 = LoadUnaligned16(cdf);
484   const uint16_t count = cdf[16];
485   const int rate = (count >> 4) + 5;
486   const __m128i cdf_max_probability =
487       _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
488   const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
489 
490   const __m128i index =
491       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
492   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
493   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
494   const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
495   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
496   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
497   cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
498   StoreUnaligned16(cdf, cdf_vec0);
499 
500   __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
501   const __m128i index1 =
502       _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
503   const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
504   const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
505   const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
506   const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
507   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
508   cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
509   StoreUnaligned16(cdf + 8, cdf_vec1);
510 
511   cdf[16] = count + static_cast<uint16_t>(count < 32);
512 }
513 
514 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
515 
UpdateCdf5(uint16_t * const cdf,const int symbol)516 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
517   UpdateCdf(cdf, 5, symbol);
518 }
519 
UpdateCdf7(uint16_t * const cdf,const int symbol)520 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
521   UpdateCdf(cdf, 7, symbol);
522 }
523 
UpdateCdf8(uint16_t * const cdf,const int symbol)524 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
525   UpdateCdf(cdf, 8, symbol);
526 }
527 
UpdateCdf9(uint16_t * const cdf,const int symbol)528 void UpdateCdf9(uint16_t* const cdf, const int symbol) {
529   UpdateCdf(cdf, 9, symbol);
530 }
531 
UpdateCdf11(uint16_t * const cdf,const int symbol)532 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
533   UpdateCdf(cdf, 11, symbol);
534 }
535 
UpdateCdf13(uint16_t * const cdf,const int symbol)536 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
537   UpdateCdf(cdf, 13, symbol);
538 }
539 
UpdateCdf16(uint16_t * const cdf,const int symbol)540 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
541   UpdateCdf(cdf, 16, symbol);
542 }
543 
544 #endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
545 #endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
546 
HostToBigEndian(const EntropyDecoder::WindowSize x)547 inline EntropyDecoder::WindowSize HostToBigEndian(
548     const EntropyDecoder::WindowSize x) {
549   static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
550 #if defined(__GNUC__)
551 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
552   return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
553 #else
554   return x;
555 #endif
556 #elif defined(_WIN32)
557   // Note Windows targets are assumed to be little endian.
558   return static_cast<EntropyDecoder::WindowSize>(
559       (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
560                        : _byteswap_ulong(static_cast<unsigned long>(x)));
561 #else
562 #error Unknown compiler!
563 #endif  // defined(__GNUC__)
564 }
565 
566 }  // namespace
567 
568 #if !LIBGAV1_CXX17
569 constexpr int EntropyDecoder::kWindowSize;  // static.
570 #endif
571 
EntropyDecoder(const uint8_t * data,size_t size,bool allow_update_cdf)572 EntropyDecoder::EntropyDecoder(const uint8_t* data, size_t size,
573                                bool allow_update_cdf)
574     : data_(data),
575       data_end_(data + size),
576       data_memcpy_end_((size >= sizeof(WindowSize))
577                            ? data + size - sizeof(WindowSize) + 1
578                            : data),
579       allow_update_cdf_(allow_update_cdf),
580       values_in_range_(kCdfMaxProbability) {
581   if (data_ < data_memcpy_end_) {
582     // This is a simplified version of PopulateBits() which loads 8 extra bits
583     // and skips the unnecessary shifts of value and window_diff_.
584     WindowSize value;
585     memcpy(&value, data_, sizeof(value));
586     data_ += sizeof(value);
587     window_diff_ = HostToBigEndian(value) ^ -1;
588     // Note the initial value of bits_ is larger than kMaxCachedBits as it's
589     // used to restore the most significant 0 bit that would be present after
590     // PopulateBits() when we extract the first symbol value.
591     // As shown in Section 8.2.2 Initialization process for symbol decoder,
592     // which uses a fixed offset to read the symbol values, the most
593     // significant bit is always 0:
594     //   The variable numBits is set equal to Min( sz * 8, 15).
595     //   The variable buf is read using the f(numBits) parsing process.
596     //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
597     //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
598     bits_ = kWindowSize - 15;
599     return;
600   }
601   window_diff_ = 0;
602   bits_ = -15;
603   PopulateBits();
604 }
605 
606 // This is similar to the ReadSymbol() implementation but it is optimized based
607 // on the following facts:
608 //   * The probability is fixed at half. So some multiplications can be replaced
609 //     with bit operations.
610 //   * Symbol count is fixed at 2.
ReadBit()611 int EntropyDecoder::ReadBit() {
612   const uint32_t curr =
613       ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
614   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
615   int bit = 1;
616   if (symbol_value >= curr) {
617     values_in_range_ -= curr;
618     window_diff_ -= static_cast<WindowSize>(curr) << bits_;
619     bit = 0;
620   } else {
621     values_in_range_ = curr;
622   }
623   NormalizeRange();
624   return bit;
625 }
626 
ReadLiteral(int num_bits)627 int64_t EntropyDecoder::ReadLiteral(int num_bits) {
628   assert(num_bits <= 32);
629   assert(num_bits > 0);
630   uint32_t literal = 0;
631   int bit = num_bits - 1;
632   do {
633     // ARM can combine a shift operation with a constant number of bits with
634     // some other operations, such as the OR operation.
635     // Here is an ARM disassembly example:
636     // orr w1, w0, w1, lsl #1
637     // which left shifts register w1 by 1 bit and OR the shift result with
638     // register w0.
639     // The next 2 lines are equivalent to:
640     // literal |= static_cast<uint32_t>(ReadBit()) << bit;
641     literal <<= 1;
642     literal |= static_cast<uint32_t>(ReadBit());
643   } while (--bit >= 0);
644   return literal;
645 }
646 
ReadSymbol(uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)647 int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf,
648                                int symbol_count) {
649   const int symbol = ReadSymbolImpl(cdf, symbol_count);
650   if (allow_update_cdf_) {
651     UpdateCdf(cdf, symbol_count, symbol);
652   }
653   return symbol;
654 }
655 
ReadSymbol(uint16_t * LIBGAV1_RESTRICT cdf)656 bool EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT cdf) {
657   assert(cdf[1] == 0);
658   const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
659   if (allow_update_cdf_) {
660     const uint16_t count = cdf[2];
661     // rate is computed in the spec as:
662     //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
663     // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
664     //  4 + (count > 15) + (count > 31)
665     // Note that the largest value for count is 32 (it is not incremented beyond
666     // 32). So using that information:
667     //  count >> 4 is 0 for count from 0 to 15.
668     //  count >> 4 is 1 for count from 16 to 31.
669     //  count >> 4 is 2 for count == 32.
670     // Now, the equation becomes:
671     //  4 + (count >> 4).
672     // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
673     // with bitwise or. So the final equation is:
674     //  4 | (count >> 4).
675     const int rate = 4 | (count >> 4);
676     if (symbol) {
677       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
678     } else {
679       cdf[0] -= cdf[0] >> rate;
680     }
681     cdf[2] += static_cast<uint16_t>(count < 32);
682   }
683   return symbol;
684 }
685 
ReadSymbolWithoutCdfUpdate(uint16_t cdf)686 bool EntropyDecoder::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
687   return ReadSymbolImpl(cdf) != 0;
688 }
689 
690 template <int symbol_count>
ReadSymbol(uint16_t * LIBGAV1_RESTRICT const cdf)691 int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf) {
692   static_assert(symbol_count >= 3 && symbol_count <= 16, "");
693   if (symbol_count == 3 || symbol_count == 4) {
694     return ReadSymbol3Or4(cdf, symbol_count);
695   }
696   int symbol;
697   if (symbol_count == 8) {
698     symbol = ReadSymbolImpl8(cdf);
699   } else if (symbol_count <= 13) {
700     symbol = ReadSymbolImpl(cdf, symbol_count);
701   } else {
702     symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
703   }
704   if (allow_update_cdf_) {
705     if (symbol_count == 5) {
706       UpdateCdf5(cdf, symbol);
707     } else if (symbol_count == 7) {
708       UpdateCdf7(cdf, symbol);
709     } else if (symbol_count == 8) {
710       UpdateCdf8(cdf, symbol);
711     } else if (symbol_count == 9) {
712       UpdateCdf9(cdf, symbol);
713     } else if (symbol_count == 11) {
714       UpdateCdf11(cdf, symbol);
715     } else if (symbol_count == 13) {
716       UpdateCdf13(cdf, symbol);
717     } else if (symbol_count == 16) {
718       UpdateCdf16(cdf, symbol);
719     } else {
720       UpdateCdf(cdf, symbol_count, symbol);
721     }
722   }
723   return symbol;
724 }
725 
ReadSymbolImpl(const uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)726 int EntropyDecoder::ReadSymbolImpl(const uint16_t* LIBGAV1_RESTRICT const cdf,
727                                    int symbol_count) {
728   assert(cdf[symbol_count - 1] == 0);
729   --symbol_count;
730   uint32_t curr = values_in_range_;
731   int symbol = -1;
732   uint32_t prev;
733   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
734   uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
735   // Search through the |cdf| array to determine where the scaled cdf value and
736   // |symbol_value| cross over.
737   do {
738     prev = curr;
739     curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
740            delta;
741     delta -= kMinimumProbabilityPerSymbol;
742   } while (symbol_value < curr);
743   values_in_range_ = prev - curr;
744   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
745   NormalizeRange();
746   return symbol;
747 }
748 
ReadSymbolImplBinarySearch(const uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)749 int EntropyDecoder::ReadSymbolImplBinarySearch(
750     const uint16_t* LIBGAV1_RESTRICT const cdf, int symbol_count) {
751   assert(cdf[symbol_count - 1] == 0);
752   assert(symbol_count > 1 && symbol_count <= 16);
753   --symbol_count;
754   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
755   // Search through the |cdf| array to determine where the scaled cdf value and
756   // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
757   // search to do this. Let |symbol| be the index of the first |cdf| array
758   // entry whose scaled cdf value is less than or equal to |symbol_value|. The
759   // binary search maintains the invariant:
760   //   low <= symbol <= high + 1
761   // and terminates when low == high + 1.
762   int low = 0;
763   int high = symbol_count - 1;
764   // The binary search maintains the invariants that |prev| is the scaled cdf
765   // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
766   // convention, the scaled cdf value for -1 is values_in_range_.) When the
767   // binary search terminates, |prev| is the scaled cdf value for symbol - 1
768   // and |curr| is the scaled cdf value for |symbol|.
769   uint32_t prev = values_in_range_;
770   uint32_t curr = 0;
771   const uint32_t values_in_range_shifted = values_in_range_ >> 8;
772   do {
773     const int mid = DivideBy2(low + high);
774     const uint32_t scaled_cdf =
775         ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
776     if (symbol_value < scaled_cdf) {
777       low = mid + 1;
778       prev = scaled_cdf;
779     } else {
780       high = mid - 1;
781       curr = scaled_cdf;
782     }
783   } while (low <= high);
784   assert(low == high + 1);
785   // At this point, |low| is the symbol that has been decoded.
786   values_in_range_ = prev - curr;
787   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
788   NormalizeRange();
789   return low;
790 }
791 
ReadSymbolImpl(uint16_t cdf)792 int EntropyDecoder::ReadSymbolImpl(uint16_t cdf) {
793   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
794   const uint32_t curr =
795       (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
796       kMinimumProbabilityPerSymbol;
797   const int symbol = static_cast<int>(symbol_value < curr);
798   if (symbol == 1) {
799     values_in_range_ = curr;
800   } else {
801     values_in_range_ -= curr;
802     window_diff_ -= static_cast<WindowSize>(curr) << bits_;
803   }
804   NormalizeRange();
805   return symbol;
806 }
807 
808 // Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
809 // calls inlined.
ReadSymbol3Or4(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol_count)810 int EntropyDecoder::ReadSymbol3Or4(uint16_t* LIBGAV1_RESTRICT const cdf,
811                                    const int symbol_count) {
812   assert(cdf[symbol_count - 1] == 0);
813   uint32_t curr = values_in_range_;
814   uint32_t prev;
815   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
816   uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
817   const uint32_t values_in_range_shifted = values_in_range_ >> 8;
818 
819   // Search through the |cdf| array to determine where the scaled cdf value and
820   // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
821   // array.
822   //
823   // The original code is:
824   //
825   //  int symbol = -1;
826   //  do {
827   //    prev = curr;
828   //    curr =
829   //        ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
830   //        + delta;
831   //    delta -= kMinimumProbabilityPerSymbol;
832   //  } while (symbol_value < curr);
833   //  if (allow_update_cdf_) {
834   //    UpdateCdf(cdf, [3,4], symbol);
835   //  }
836   //
837   // The do-while loop is unrolled with three or four iterations, and the
838   // UpdateCdf call is inlined and merged into the iterations.
839   int symbol = 0;
840   // Iteration 0.
841   prev = curr;
842   curr =
843       ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
844   if (symbol_value >= curr) {
845     // symbol == 0.
846     if (allow_update_cdf_) {
847       // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
848       const uint16_t count = cdf[symbol_count];
849       cdf[symbol_count] += static_cast<uint16_t>(count < 32);
850       const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
851       if (symbol_count == 4) {
852 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
853         // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
854         // NEON code is slower. Consider using the C version if __arm__ is
855         // defined.
856         // 2. The ARM NEON code (compiled for arm64) is slightly slower on
857         // Samsung Galaxy S8+ (SM-G955FD).
858         uint16x4_t cdf_vec = vld1_u16(cdf);
859         const int16x4_t negative_rate = vdup_n_s16(-rate);
860         const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
861         cdf_vec = vsub_u16(cdf_vec, delta);
862         vst1_u16(cdf, cdf_vec);
863 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
864         __m128i cdf_vec = LoadLo8(cdf);
865         const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
866         cdf_vec = _mm_sub_epi16(cdf_vec, delta);
867         StoreLo8(cdf, cdf_vec);
868 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
869         cdf[0] -= cdf[0] >> rate;
870         cdf[1] -= cdf[1] >> rate;
871         cdf[2] -= cdf[2] >> rate;
872 #endif
873       } else {  // symbol_count == 3.
874         cdf[0] -= cdf[0] >> rate;
875         cdf[1] -= cdf[1] >> rate;
876       }
877     }
878     goto found;
879   }
880   ++symbol;
881   delta -= kMinimumProbabilityPerSymbol;
882   // Iteration 1.
883   prev = curr;
884   curr =
885       ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
886   if (symbol_value >= curr) {
887     // symbol == 1.
888     if (allow_update_cdf_) {
889       // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
890       const uint16_t count = cdf[symbol_count];
891       cdf[symbol_count] += static_cast<uint16_t>(count < 32);
892       const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
893       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
894       cdf[1] -= cdf[1] >> rate;
895       if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
896     }
897     goto found;
898   }
899   ++symbol;
900   if (symbol_count == 4) {
901     delta -= kMinimumProbabilityPerSymbol;
902     // Iteration 2.
903     prev = curr;
904     curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
905            delta;
906     if (symbol_value >= curr) {
907       // symbol == 2.
908       if (allow_update_cdf_) {
909         // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
910         const uint16_t count = cdf[4];
911         cdf[4] += static_cast<uint16_t>(count < 32);
912         const int rate = (count >> 4) + 5;
913         cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
914         cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
915         cdf[2] -= cdf[2] >> rate;
916       }
917       goto found;
918     }
919     ++symbol;
920   }
921   // |delta| is 0 for the last iteration.
922   // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
923   prev = curr;
924   // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
925   curr = 0;
926   // symbol == [2,3].
927   if (allow_update_cdf_) {
928     // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
929     const uint16_t count = cdf[symbol_count];
930     cdf[symbol_count] += static_cast<uint16_t>(count < 32);
931     const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
932     if (symbol_count == 4) {
933 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
934       // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
935       // code is a tiny bit slower. Consider using the C version if __arm__ is
936       // defined.
937       uint16x4_t cdf_vec = vld1_u16(cdf);
938       const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
939       const int16x4_t diff =
940           vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
941       const int16x4_t negative_rate = vdup_n_s16(-rate);
942       const uint16x4_t delta =
943           vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
944       cdf_vec = vadd_u16(cdf_vec, delta);
945       vst1_u16(cdf, cdf_vec);
946       cdf[3] = 0;
947 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
948       __m128i cdf_vec = LoadLo8(cdf);
949       const __m128i cdf_max_probability =
950           _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
951       const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
952       const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
953       cdf_vec = _mm_add_epi16(cdf_vec, delta);
954       StoreLo8(cdf, cdf_vec);
955       cdf[3] = 0;
956 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
957       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
958       cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
959       cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
960 #endif
961     } else {  // symbol_count == 3.
962       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
963       cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
964     }
965   }
966 found:
967   // End of unrolled do-while loop.
968 
969   values_in_range_ = prev - curr;
970   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
971   NormalizeRange();
972   return symbol;
973 }
974 
ReadSymbolImpl8(const uint16_t * LIBGAV1_RESTRICT const cdf)975 int EntropyDecoder::ReadSymbolImpl8(
976     const uint16_t* LIBGAV1_RESTRICT const cdf) {
977   assert(cdf[7] == 0);
978   uint32_t curr = values_in_range_;
979   uint32_t prev;
980   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
981   uint32_t delta = kMinimumProbabilityPerSymbol * 7;
982   // Search through the |cdf| array to determine where the scaled cdf value and
983   // |symbol_value| cross over.
984   //
985   // The original code is:
986   //
987   // int symbol = -1;
988   // do {
989   //   prev = curr;
990   //   curr =
991   //       (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
992   //       + delta;
993   //   delta -= kMinimumProbabilityPerSymbol;
994   // } while (symbol_value < curr);
995   //
996   // The do-while loop is unrolled with eight iterations.
997   int symbol = 0;
998 
999 #define READ_SYMBOL_ITERATION                                                \
1000   prev = curr;                                                               \
1001   curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
1002          delta;                                                              \
1003   if (symbol_value >= curr) goto found;                                      \
1004   ++symbol;                                                                  \
1005   delta -= kMinimumProbabilityPerSymbol
1006 
1007   READ_SYMBOL_ITERATION;  // Iteration 0.
1008   READ_SYMBOL_ITERATION;  // Iteration 1.
1009   READ_SYMBOL_ITERATION;  // Iteration 2.
1010   READ_SYMBOL_ITERATION;  // Iteration 3.
1011   READ_SYMBOL_ITERATION;  // Iteration 4.
1012   READ_SYMBOL_ITERATION;  // Iteration 5.
1013 
1014   // The last two iterations can be simplified, so they don't use the
1015   // READ_SYMBOL_ITERATION macro.
1016 #undef READ_SYMBOL_ITERATION
1017 
1018   // Iteration 6.
1019   prev = curr;
1020   curr =
1021       (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
1022   if (symbol_value >= curr) goto found;  // symbol == 6.
1023   ++symbol;
1024   // |delta| is 0 for the last iteration.
1025   // Iteration 7.
1026   prev = curr;
1027   // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
1028   curr = 0;
1029   // symbol == 7.
1030 found:
1031   // End of unrolled do-while loop.
1032 
1033   values_in_range_ = prev - curr;
1034   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
1035   NormalizeRange();
1036   return symbol;
1037 }
1038 
PopulateBits()1039 void EntropyDecoder::PopulateBits() {
1040   constexpr int kMaxCachedBits = kWindowSize - 16;
1041 #if defined(__aarch64__)
1042   // Fast path: read eight bytes and add the first six bytes to window_diff_.
1043   // This fast path makes the following assumptions.
1044   // 1. We assume that unaligned load of uint64_t is fast.
1045   // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
1046   //    bytes depending on the value of bits_. This fast path always reads 6
1047   //    bytes, which results in more calls to PopulateBits(). We assume that
1048   //    making more calls to a faster PopulateBits() is overall a win.
1049   // NOTE: Although this fast path could also be used on x86_64, it hurts
1050   // performance (measured on Lenovo ThinkStation P920 running Linux). (The
1051   // reason is still unknown.) Therefore this fast path is only used on arm64.
1052   static_assert(kWindowSize == 64, "");
1053   if (data_ < data_memcpy_end_) {
1054     uint64_t value;
1055     // arm64 supports unaligned loads, so this memcpy call is compiled to a
1056     // single ldr instruction.
1057     memcpy(&value, data_, sizeof(value));
1058     data_ += kMaxCachedBits >> 3;
1059     value = HostToBigEndian(value) ^ -1;
1060     value >>= kWindowSize - kMaxCachedBits;
1061     window_diff_ = value | (window_diff_ << kMaxCachedBits);
1062     bits_ += kMaxCachedBits;
1063     return;
1064   }
1065 #endif
1066 
1067   const uint8_t* data = data_;
1068   int bits = bits_;
1069   WindowSize window_diff = window_diff_;
1070 
1071   int count = kWindowSize - 9 - (bits + 15);
1072   // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
1073   // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
1074   // iterations when WindowSize is 64 bits. So it is not profitable to
1075   // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
1076   // the fast path above is not compiled.
1077 
1078 #ifdef __clang__
1079 #pragma clang loop vectorize(disable) interleave(disable)
1080 #endif
1081   for (; count >= 0 && data < data_end_; count -= 8) {
1082     const uint8_t value = *data++ ^ -1;
1083     window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
1084     bits += 8;
1085   }
1086   assert(bits <= kMaxCachedBits);
1087   if (data == data_end_) {
1088     // Shift in some 1s. This is equivalent to providing fake 0 data bits.
1089     window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
1090     bits = kMaxCachedBits;
1091   }
1092 
1093   data_ = data;
1094   bits_ = bits;
1095   window_diff_ = window_diff;
1096 }
1097 
NormalizeRange()1098 void EntropyDecoder::NormalizeRange() {
1099   const int bits_used = 15 ^ FloorLog2(values_in_range_);
1100   bits_ -= bits_used;
1101   values_in_range_ <<= bits_used;
1102   if (bits_ < 0) PopulateBits();
1103 }
1104 
1105 // Explicit instantiations.
1106 template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
1107 template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
1108 template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
1109 template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
1110 template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
1111 template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
1112 template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
1113 template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
1114 template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
1115 template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
1116 template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
1117 template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
1118 template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
1119 
1120 }  // namespace libgav1
1121