• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/utils/entropy_decoder.h"
16 
17 #include <cassert>
18 #include <cstring>
19 
20 #include "src/utils/common.h"
21 #include "src/utils/compiler_attributes.h"
22 #include "src/utils/constants.h"
23 
24 #if defined(__ARM_NEON__) || defined(__aarch64__) || \
25     (defined(_MSC_VER) && defined(_M_ARM))
26 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
27 #else
28 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
29 #endif
30 
31 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
32 #include <arm_neon.h>
33 #endif
34 
35 #if defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC)
36 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 1
37 #else
38 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 0
39 #endif
40 
41 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
42 #include <smmintrin.h>
43 #endif
44 
45 namespace libgav1 {
46 namespace {
47 
48 constexpr uint32_t kReadBitMask = ~255;
49 // This constant is used to set the value of |bits_| so that bits can be read
50 // after end of stream without trying to refill the buffer for a reasonably long
51 // time.
52 constexpr int kLargeBitCount = 0x4000;
53 constexpr int kCdfPrecision = 6;
54 constexpr int kMinimumProbabilityPerSymbol = 4;
55 
56 // This function computes the "cur" variable as specified inside the do-while
57 // loop in Section 8.2.6 of the spec. This function is monotonically
58 // decreasing as the values of index increases (note that the |cdf| array is
59 // sorted in decreasing order).
ScaleCdf(uint32_t values_in_range_shifted,const uint16_t * const cdf,int index,int symbol_count)60 uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
61                   int index, int symbol_count) {
62   return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
63          (kMinimumProbabilityPerSymbol * (symbol_count - index));
64 }
65 
UpdateCdf(uint16_t * const cdf,const int symbol_count,const int symbol)66 void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) {
67   const uint16_t count = cdf[symbol_count];
68   // rate is computed in the spec as:
69   //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
70   // In this case cdf[N] is |count|.
71   // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
72   // symbol_count > 3. So the equation becomes:
73   //  4 + (count > 15) + (count > 31) + (symbol_count > 3).
74   // Note that the largest value for count is 32 (it is not incremented beyond
75   // 32). So using that information:
76   //  count >> 4 is 0 for count from 0 to 15.
77   //  count >> 4 is 1 for count from 16 to 31.
78   //  count >> 4 is 2 for count == 31.
79   // Now, the equation becomes:
80   //  4 + (count >> 4) + (symbol_count > 3).
81   // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
82   // with bitwise or. So the final equation is:
83   // (4 | (count >> 4)) + (symbol_count > 3).
84   const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count > 3);
85   // Hints for further optimizations:
86   //
87   // 1. clang can vectorize this for loop with width 4, even though the loop
88   // contains an if-else statement. Therefore, it may be advantageous to use
89   // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
90   // (a multiple of 4 that's not too small).
91   //
92   // 2. The for loop can be rewritten in the following form, which would enable
93   // clang to vectorize the loop with width 8:
94   //
95   //   const int rounding = (1 << rate) - 1;
96   //   for (int i = 0; i < symbol_count - 1; ++i) {
97   //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
98   //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
99   //   }
100   //
101   // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
102   // integer arithmetic. The result of the unsigned subtraction is cast to a
103   // signed integer and right-shifted. This requires the right shift of a
104   // signed integer be an arithmetic shift, which is true for clang, gcc, and
105   // Visual C++.
106   for (int i = 0; i < symbol_count - 1; ++i) {
107     if (i < symbol) {
108       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
109     } else {
110       cdf[i] -= cdf[i] >> rate;
111     }
112   }
113   cdf[symbol_count] += static_cast<uint16_t>(count < 32);
114 }
115 
116 // Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
117 // of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
118 // SIMD instruction sets if available.
119 
120 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
121 
122 // The UpdateCdf() method contains the following for loop:
123 //
124 //   for (int i = 0; i < symbol_count - 1; ++i) {
125 //     if (i < symbol) {
126 //       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
127 //     } else {
128 //       cdf[i] -= cdf[i] >> rate;
129 //     }
130 //   }
131 //
132 // It can be rewritten in the following two forms, which are amenable to SIMD
133 // implementations:
134 //
135 //   const int rounding = (1 << rate) - 1;
136 //   for (int i = 0; i < symbol_count - 1; ++i) {
137 //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
138 //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
139 //   }
140 //
141 // or:
142 //
143 //   const int rounding = (1 << rate) - 1;
144 //   for (int i = 0; i < symbol_count - 1; ++i) {
145 //     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
146 //     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
147 //   }
148 //
149 // The following ARM NEON implementations use the second form, which seems
150 // slightly faster.
151 //
152 // The cdf array has symbol_count + 1 elements. The first symbol_count elements
153 // are the CDF. The last element is a count that is initialized to 0 and may
154 // grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
155 // cdf[symbol_count - 1] is always 0, the for loop does not update
156 // cdf[symbol_count - 1]. However, it would be correct to have the for loop
157 // update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
158 // for loop would take the else branch when i is symbol_count - 1:
159 //      cdf[i] -= cdf[i] >> rate;
160 // Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
161 // after the update. The ARM NEON implementations take advantage of this in the
162 // following two cases:
163 // 1. When symbol_count is 8 or 16, the vectorized code updates the first
164 //    symbol_count elements in the array.
165 // 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
166 //    the cdf array. Since an invalid CDF value is written into cdf[7], the
167 //    count in cdf[7] needs to be fixed up after the vectorized code.
168 
UpdateCdf5(uint16_t * const cdf,const int symbol)169 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
170   uint16x4_t cdf_vec = vld1_u16(cdf);
171   const uint16_t count = cdf[5];
172   const int rate = (4 | (count >> 4)) + 1;
173   const uint16x4_t zero = vdup_n_u16(0);
174   const uint16x4_t cdf_max_probability =
175       vdup_n_u16(kCdfMaxProbability + 1 - (1 << rate));
176   const uint16x4_t index = vcreate_u16(0x0003000200010000);
177   const uint16x4_t symbol_vec = vdup_n_u16(symbol);
178   const uint16x4_t mask = vclt_u16(index, symbol_vec);
179   const uint16x4_t a = vbsl_u16(mask, cdf_max_probability, zero);
180   const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(cdf_vec, a));
181   const int16x4_t negative_rate = vdup_n_s16(-rate);
182   const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
183   cdf_vec = vsub_u16(cdf_vec, delta);
184   vst1_u16(cdf, cdf_vec);
185   cdf[5] = count + static_cast<uint16_t>(count < 32);
186 }
187 
188 // This version works for |symbol_count| = 7, 8, or 9.
189 template <int symbol_count>
UpdateCdf7To9(uint16_t * const cdf,const int symbol)190 void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
191   static_assert(symbol_count >= 7 && symbol_count <= 9, "");
192   uint16x8_t cdf_vec = vld1q_u16(cdf);
193   const uint16_t count = cdf[symbol_count];
194   const int rate = (4 | (count >> 4)) + 1;
195   const uint16x8_t zero = vdupq_n_u16(0);
196   const uint16x8_t cdf_max_probability =
197       vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
198   const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
199                                         vcreate_u16(0x0007000600050004));
200   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
201   const uint16x8_t mask = vcltq_u16(index, symbol_vec);
202   const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
203   const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
204   const int16x8_t negative_rate = vdupq_n_s16(-rate);
205   const uint16x8_t delta =
206       vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
207   cdf_vec = vsubq_u16(cdf_vec, delta);
208   vst1q_u16(cdf, cdf_vec);
209   cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
210 }
211 
UpdateCdf7(uint16_t * const cdf,const int symbol)212 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
213   UpdateCdf7To9<7>(cdf, symbol);
214 }
215 
UpdateCdf8(uint16_t * const cdf,const int symbol)216 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
217   UpdateCdf7To9<8>(cdf, symbol);
218 }
219 
UpdateCdf11(uint16_t * const cdf,const int symbol)220 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
221   uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
222   const uint16_t count = cdf[11];
223   cdf[11] = count + static_cast<uint16_t>(count < 32);
224   const int rate = (4 | (count >> 4)) + 1;
225   if (symbol > 1) {
226     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
227     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
228     const uint16x8_t zero = vdupq_n_u16(0);
229     const uint16x8_t cdf_max_probability =
230         vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
231     const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
232     const int16x8_t negative_rate = vdupq_n_s16(-rate);
233     const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
234                                           vcreate_u16(0x0009000800070006));
235     const uint16x8_t mask = vcltq_u16(index, symbol_vec);
236     const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
237     const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
238     const uint16x8_t delta =
239         vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
240     cdf_vec = vsubq_u16(cdf_vec, delta);
241     vst1q_u16(cdf + 2, cdf_vec);
242   } else {
243     if (symbol != 0) {
244       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
245       cdf[1] -= cdf[1] >> rate;
246     } else {
247       cdf[0] -= cdf[0] >> rate;
248       cdf[1] -= cdf[1] >> rate;
249     }
250     const int16x8_t negative_rate = vdupq_n_s16(-rate);
251     const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
252     cdf_vec = vsubq_u16(cdf_vec, delta);
253     vst1q_u16(cdf + 2, cdf_vec);
254   }
255 }
256 
UpdateCdf13(uint16_t * const cdf,const int symbol)257 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
258   uint16x8_t cdf_vec0 = vld1q_u16(cdf);
259   uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
260   const uint16_t count = cdf[13];
261   const int rate = (4 | (count >> 4)) + 1;
262   const uint16x8_t zero = vdupq_n_u16(0);
263   const uint16x8_t cdf_max_probability =
264       vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
265   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
266   const int16x8_t negative_rate = vdupq_n_s16(-rate);
267 
268   uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
269                                   vcreate_u16(0x0007000600050004));
270   uint16x8_t mask = vcltq_u16(index, symbol_vec);
271   uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
272   int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec0, a));
273   uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
274   cdf_vec0 = vsubq_u16(cdf_vec0, delta);
275   vst1q_u16(cdf, cdf_vec0);
276 
277   index = vcombine_u16(vcreate_u16(0x0007000600050004),
278                        vcreate_u16(0x000b000a00090008));
279   mask = vcltq_u16(index, symbol_vec);
280   a = vbslq_u16(mask, cdf_max_probability, zero);
281   diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec1, a));
282   delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
283   cdf_vec1 = vsubq_u16(cdf_vec1, delta);
284   vst1q_u16(cdf + 4, cdf_vec1);
285 
286   cdf[13] = count + static_cast<uint16_t>(count < 32);
287 }
288 
UpdateCdf16(uint16_t * const cdf,const int symbol)289 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
290   uint16x8_t cdf_vec = vld1q_u16(cdf);
291   const uint16_t count = cdf[16];
292   const int rate = (4 | (count >> 4)) + 1;
293   const uint16x8_t zero = vdupq_n_u16(0);
294   const uint16x8_t cdf_max_probability =
295       vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
296   const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
297   const int16x8_t negative_rate = vdupq_n_s16(-rate);
298 
299   uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
300                                   vcreate_u16(0x0007000600050004));
301   uint16x8_t mask = vcltq_u16(index, symbol_vec);
302   uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
303   int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
304   uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
305   cdf_vec = vsubq_u16(cdf_vec, delta);
306   vst1q_u16(cdf, cdf_vec);
307 
308   cdf_vec = vld1q_u16(cdf + 8);
309   index = vcombine_u16(vcreate_u16(0x000b000a00090008),
310                        vcreate_u16(0x000f000e000d000c));
311   mask = vcltq_u16(index, symbol_vec);
312   a = vbslq_u16(mask, cdf_max_probability, zero);
313   diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
314   delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
315   cdf_vec = vsubq_u16(cdf_vec, delta);
316   vst1q_u16(cdf + 8, cdf_vec);
317 
318   cdf[16] = count + static_cast<uint16_t>(count < 32);
319 }
320 
321 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
322 
323 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
324 
LoadLo8(const void * a)325 inline __m128i LoadLo8(const void* a) {
326   return _mm_loadl_epi64(static_cast<const __m128i*>(a));
327 }
328 
LoadUnaligned16(const void * a)329 inline __m128i LoadUnaligned16(const void* a) {
330   return _mm_loadu_si128(static_cast<const __m128i*>(a));
331 }
332 
StoreLo8(void * a,const __m128i v)333 inline void StoreLo8(void* a, const __m128i v) {
334   _mm_storel_epi64(static_cast<__m128i*>(a), v);
335 }
336 
StoreUnaligned16(void * a,const __m128i v)337 inline void StoreUnaligned16(void* a, const __m128i v) {
338   _mm_storeu_si128(static_cast<__m128i*>(a), v);
339 }
340 
UpdateCdf5(uint16_t * const cdf,const int symbol)341 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
342   __m128i cdf_vec = LoadLo8(cdf);
343   const uint16_t count = cdf[5];
344   const int rate = (4 | (count >> 4)) + 1;
345   const __m128i zero = _mm_setzero_si128();
346   const __m128i cdf_max_probability = _mm_shufflelo_epi16(
347       _mm_cvtsi32_si128(kCdfMaxProbability + 1 - (1 << rate)), 0);
348   const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00030002, 0x00010000);
349   const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
350   const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
351   const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
352   const __m128i diff = _mm_sub_epi16(cdf_vec, a);
353   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
354   cdf_vec = _mm_sub_epi16(cdf_vec, delta);
355   StoreLo8(cdf, cdf_vec);
356   cdf[5] = count + static_cast<uint16_t>(count < 32);
357 }
358 
359 // This version works for |symbol_count| = 7, 8, or 9.
360 template <int symbol_count>
UpdateCdf7To9(uint16_t * const cdf,const int symbol)361 void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
362   static_assert(symbol_count >= 7 && symbol_count <= 9, "");
363   __m128i cdf_vec = LoadUnaligned16(cdf);
364   const uint16_t count = cdf[symbol_count];
365   const int rate = (4 | (count >> 4)) + 1;
366   const __m128i zero = _mm_setzero_si128();
367   const __m128i cdf_max_probability =
368       _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
369   const __m128i index =
370       _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
371   const __m128i symbol_vec = _mm_set1_epi16(symbol);
372   const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
373   const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
374   const __m128i diff = _mm_sub_epi16(cdf_vec, a);
375   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
376   cdf_vec = _mm_sub_epi16(cdf_vec, delta);
377   StoreUnaligned16(cdf, cdf_vec);
378   cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
379 }
380 
UpdateCdf7(uint16_t * const cdf,const int symbol)381 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
382   UpdateCdf7To9<7>(cdf, symbol);
383 }
384 
UpdateCdf8(uint16_t * const cdf,const int symbol)385 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
386   UpdateCdf7To9<8>(cdf, symbol);
387 }
388 
UpdateCdf11(uint16_t * const cdf,const int symbol)389 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
390   __m128i cdf_vec = LoadUnaligned16(cdf + 2);
391   const uint16_t count = cdf[11];
392   cdf[11] = count + static_cast<uint16_t>(count < 32);
393   const int rate = (4 | (count >> 4)) + 1;
394   if (symbol > 1) {
395     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
396     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
397     const __m128i zero = _mm_setzero_si128();
398     const __m128i cdf_max_probability =
399         _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
400     const __m128i index =
401         _mm_set_epi32(0x00090008, 0x00070006, 0x00050004, 0x00030002);
402     const __m128i symbol_vec = _mm_set1_epi16(symbol);
403     const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
404     const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
405     const __m128i diff = _mm_sub_epi16(cdf_vec, a);
406     const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
407     cdf_vec = _mm_sub_epi16(cdf_vec, delta);
408     StoreUnaligned16(cdf + 2, cdf_vec);
409   } else {
410     if (symbol != 0) {
411       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
412       cdf[1] -= cdf[1] >> rate;
413     } else {
414       cdf[0] -= cdf[0] >> rate;
415       cdf[1] -= cdf[1] >> rate;
416     }
417     const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
418     cdf_vec = _mm_sub_epi16(cdf_vec, delta);
419     StoreUnaligned16(cdf + 2, cdf_vec);
420   }
421 }
422 
UpdateCdf13(uint16_t * const cdf,const int symbol)423 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
424   __m128i cdf_vec0 = LoadUnaligned16(cdf);
425   __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
426   const uint16_t count = cdf[13];
427   const int rate = (4 | (count >> 4)) + 1;
428   const __m128i zero = _mm_setzero_si128();
429   const __m128i cdf_max_probability =
430       _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
431   const __m128i symbol_vec = _mm_set1_epi16(symbol);
432 
433   const __m128i index =
434       _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
435   const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
436   const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
437   const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
438   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
439   cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
440   StoreUnaligned16(cdf, cdf_vec0);
441 
442   const __m128i index1 =
443       _mm_set_epi32(0x000b000a, 0x00090008, 0x00070006, 0x00050004);
444   const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
445   const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
446   const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
447   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
448   cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
449   StoreUnaligned16(cdf + 4, cdf_vec1);
450 
451   cdf[13] = count + static_cast<uint16_t>(count < 32);
452 }
453 
UpdateCdf16(uint16_t * const cdf,const int symbol)454 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
455   __m128i cdf_vec0 = LoadUnaligned16(cdf);
456   const uint16_t count = cdf[16];
457   const int rate = (4 | (count >> 4)) + 1;
458   const __m128i zero = _mm_setzero_si128();
459   const __m128i cdf_max_probability =
460       _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
461   const __m128i symbol_vec = _mm_set1_epi16(symbol);
462 
463   const __m128i index =
464       _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
465   const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
466   const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
467   const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
468   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
469   cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
470   StoreUnaligned16(cdf, cdf_vec0);
471 
472   __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
473   const __m128i index1 =
474       _mm_set_epi32(0x000f000e, 0x000d000c, 0x000b000a, 0x00090008);
475   const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
476   const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
477   const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
478   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
479   cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
480   StoreUnaligned16(cdf + 8, cdf_vec1);
481 
482   cdf[16] = count + static_cast<uint16_t>(count < 32);
483 }
484 
485 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
486 
UpdateCdf5(uint16_t * const cdf,const int symbol)487 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
488   UpdateCdf(cdf, 5, symbol);
489 }
490 
UpdateCdf7(uint16_t * const cdf,const int symbol)491 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
492   UpdateCdf(cdf, 7, symbol);
493 }
494 
UpdateCdf8(uint16_t * const cdf,const int symbol)495 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
496   UpdateCdf(cdf, 8, symbol);
497 }
498 
UpdateCdf11(uint16_t * const cdf,const int symbol)499 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
500   UpdateCdf(cdf, 11, symbol);
501 }
502 
UpdateCdf13(uint16_t * const cdf,const int symbol)503 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
504   UpdateCdf(cdf, 13, symbol);
505 }
506 
UpdateCdf16(uint16_t * const cdf,const int symbol)507 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
508   UpdateCdf(cdf, 16, symbol);
509 }
510 
511 #endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
512 #endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
513 
514 }  // namespace
515 
516 #if !LIBGAV1_CXX17
517 constexpr int DaalaBitReader::kWindowSize;  // static.
518 #endif
519 
DaalaBitReader(const uint8_t * data,size_t size,bool allow_update_cdf)520 DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
521                                bool allow_update_cdf)
522     : data_(data),
523       size_(size),
524       data_index_(0),
525       allow_update_cdf_(allow_update_cdf) {
526   window_diff_ = (WindowSize{1} << (kWindowSize - 1)) - 1;
527   values_in_range_ = kCdfMaxProbability;
528   bits_ = -15;
529   PopulateBits();
530 }
531 
532 // This is similar to the ReadSymbol() implementation but it is optimized based
533 // on the following facts:
534 //   * The probability is fixed at half. So some multiplications can be replaced
535 //     with bit operations.
536 //   * Symbol count is fixed at 2.
ReadBit()537 int DaalaBitReader::ReadBit() {
538   const uint32_t curr =
539       ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
540   const WindowSize zero_threshold = static_cast<WindowSize>(curr)
541                                     << (kWindowSize - 16);
542   int bit = 1;
543   if (window_diff_ >= zero_threshold) {
544     values_in_range_ -= curr;
545     window_diff_ -= zero_threshold;
546     bit = 0;
547   } else {
548     values_in_range_ = curr;
549   }
550   NormalizeRange();
551   return bit;
552 }
553 
ReadLiteral(int num_bits)554 int64_t DaalaBitReader::ReadLiteral(int num_bits) {
555   assert(num_bits <= 32);
556   assert(num_bits > 0);
557   uint32_t literal = 0;
558   int bit = num_bits - 1;
559   do {
560     // ARM can combine a shift operation with a constant number of bits with
561     // some other operations, such as the OR operation.
562     // Here is an ARM disassembly example:
563     // orr w1, w0, w1, lsl #1
564     // which left shifts register w1 by 1 bit and OR the shift result with
565     // register w0.
566     // The next 2 lines are equivalent to:
567     // literal |= static_cast<uint32_t>(ReadBit()) << bit;
568     literal <<= 1;
569     literal |= static_cast<uint32_t>(ReadBit());
570   } while (--bit >= 0);
571   return literal;
572 }
573 
ReadSymbol(uint16_t * const cdf,int symbol_count)574 int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) {
575   const int symbol = ReadSymbolImpl(cdf, symbol_count);
576   if (allow_update_cdf_) {
577     UpdateCdf(cdf, symbol_count, symbol);
578   }
579   return symbol;
580 }
581 
ReadSymbol(uint16_t * cdf)582 bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
583   const bool symbol = ReadSymbolImpl(cdf) != 0;
584   if (allow_update_cdf_) {
585     const uint16_t count = cdf[2];
586     // rate is computed in the spec as:
587     //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
588     // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
589     //  4 + (count > 15) + (count > 31)
590     // Note that the largest value for count is 32 (it is not incremented beyond
591     // 32). So using that information:
592     //  count >> 4 is 0 for count from 0 to 15.
593     //  count >> 4 is 1 for count from 16 to 31.
594     //  count >> 4 is 2 for count == 32.
595     // Now, the equation becomes:
596     //  4 + (count >> 4).
597     // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
598     // with bitwise or. So the final equation is:
599     //  4 | (count >> 4).
600     const int rate = 4 | (count >> 4);
601     if (symbol) {
602       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
603     } else {
604       cdf[0] -= cdf[0] >> rate;
605     }
606     cdf[2] += static_cast<uint16_t>(count < 32);
607   }
608   return symbol;
609 }
610 
ReadSymbolWithoutCdfUpdate(uint16_t * cdf)611 bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t* cdf) {
612   return ReadSymbolImpl(cdf) != 0;
613 }
614 
615 template <int symbol_count>
ReadSymbol(uint16_t * const cdf)616 int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
617   static_assert(symbol_count >= 3 && symbol_count <= 16, "");
618   if (symbol_count == 4) {
619     return ReadSymbol4(cdf);
620   }
621   int symbol;
622   if (symbol_count == 8) {
623     symbol = ReadSymbolImpl8(cdf);
624   } else if (symbol_count <= 13) {
625     symbol = ReadSymbolImpl(cdf, symbol_count);
626   } else {
627     symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
628   }
629   if (allow_update_cdf_) {
630     if (symbol_count == 5) {
631       UpdateCdf5(cdf, symbol);
632     } else if (symbol_count == 7) {
633       UpdateCdf7(cdf, symbol);
634     } else if (symbol_count == 8) {
635       UpdateCdf8(cdf, symbol);
636     } else if (symbol_count == 11) {
637       UpdateCdf11(cdf, symbol);
638     } else if (symbol_count == 13) {
639       UpdateCdf13(cdf, symbol);
640     } else if (symbol_count == 16) {
641       UpdateCdf16(cdf, symbol);
642     } else {
643       UpdateCdf(cdf, symbol_count, symbol);
644     }
645   }
646   return symbol;
647 }
648 
ReadSymbolImpl(const uint16_t * const cdf,int symbol_count)649 int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf,
650                                    int symbol_count) {
651   assert(cdf[symbol_count - 1] == 0);
652   --symbol_count;
653   uint32_t curr = values_in_range_;
654   int symbol = -1;
655   uint32_t prev;
656   const auto symbol_value =
657       static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
658   uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
659   // Search through the |cdf| array to determine where the scaled cdf value and
660   // |symbol_value| cross over.
661   do {
662     prev = curr;
663     curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
664            delta;
665     delta -= kMinimumProbabilityPerSymbol;
666   } while (symbol_value < curr);
667   values_in_range_ = prev - curr;
668   window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
669   NormalizeRange();
670   return symbol;
671 }
672 
ReadSymbolImplBinarySearch(const uint16_t * const cdf,int symbol_count)673 int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf,
674                                                int symbol_count) {
675   assert(cdf[symbol_count - 1] == 0);
676   assert(symbol_count > 1 && symbol_count <= 16);
677   --symbol_count;
678   const auto symbol_value =
679       static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
680   // Search through the |cdf| array to determine where the scaled cdf value and
681   // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
682   // search to do this. Let |symbol| be the index of the first |cdf| array
683   // entry whose scaled cdf value is less than or equal to |symbol_value|. The
684   // binary search maintains the invariant:
685   //   low <= symbol <= high + 1
686   // and terminates when low == high + 1.
687   int low = 0;
688   int high = symbol_count - 1;
689   // The binary search maintains the invariants that |prev| is the scaled cdf
690   // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
691   // convention, the scaled cdf value for -1 is values_in_range_.) When the
692   // binary search terminates, |prev| is the scaled cdf value for symbol - 1
693   // and |curr| is the scaled cdf value for |symbol|.
694   uint32_t prev = values_in_range_;
695   uint32_t curr = 0;
696   const uint32_t values_in_range_shifted = values_in_range_ >> 8;
697   do {
698     const int mid = DivideBy2(low + high);
699     const uint32_t scaled_cdf =
700         ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
701     if (symbol_value < scaled_cdf) {
702       low = mid + 1;
703       prev = scaled_cdf;
704     } else {
705       high = mid - 1;
706       curr = scaled_cdf;
707     }
708   } while (low <= high);
709   assert(low == high + 1);
710   // At this point, |low| is the symbol that has been decoded.
711   values_in_range_ = prev - curr;
712   window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
713   NormalizeRange();
714   return low;
715 }
716 
ReadSymbolImpl(const uint16_t * const cdf)717 int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf) {
718   assert(cdf[1] == 0);
719   const auto symbol_value =
720       static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
721   const uint32_t curr = ScaleCdf(values_in_range_ >> 8, cdf, 0, 1);
722   const int symbol = static_cast<int>(symbol_value < curr);
723   if (symbol == 1) {
724     values_in_range_ = curr;
725   } else {
726     values_in_range_ -= curr;
727     window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
728   }
729   NormalizeRange();
730   return symbol;
731 }
732 
733 // Equivalent to ReadSymbol(cdf, 4), with the ReadSymbolImpl and UpdateCdf
734 // calls inlined.
ReadSymbol4(uint16_t * const cdf)735 int DaalaBitReader::ReadSymbol4(uint16_t* const cdf) {
736   assert(cdf[3] == 0);
737   uint32_t curr = values_in_range_;
738   uint32_t prev;
739   const auto symbol_value =
740       static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
741   uint32_t delta = kMinimumProbabilityPerSymbol * 3;
742   const uint32_t values_in_range_shifted = values_in_range_ >> 8;
743 
744   // Search through the |cdf| array to determine where the scaled cdf value and
745   // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
746   // array.
747   //
748   // The original code is:
749   //
750   //  int symbol = -1;
751   //  do {
752   //    prev = curr;
753   //    curr =
754   //        ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
755   //        + delta;
756   //    delta -= kMinimumProbabilityPerSymbol;
757   //  } while (symbol_value < curr);
758   //  if (allow_update_cdf_) {
759   //    UpdateCdf(cdf, 4, symbol);
760   //  }
761   //
762   // The do-while loop is unrolled with four iterations, and the UpdateCdf call
763   // is inlined and merged into the four iterations.
764   int symbol = 0;
765   // Iteration 0.
766   prev = curr;
767   curr =
768       ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
769   if (symbol_value >= curr) {
770     // symbol == 0.
771     if (allow_update_cdf_) {
772       // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/0).
773       const uint16_t count = cdf[4];
774       cdf[4] += static_cast<uint16_t>(count < 32);
775       const int rate = (4 | (count >> 4)) + 1;
776 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
777       // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
778       // NEON code is slower. Consider using the C version if __arm__ is
779       // defined.
780       // 2. The ARM NEON code (compiled for arm64) is slightly slower on
781       // Samsung Galaxy S8+ (SM-G955FD).
782       uint16x4_t cdf_vec = vld1_u16(cdf);
783       const int16x4_t negative_rate = vdup_n_s16(-rate);
784       const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
785       cdf_vec = vsub_u16(cdf_vec, delta);
786       vst1_u16(cdf, cdf_vec);
787 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
788       __m128i cdf_vec = LoadLo8(cdf);
789       const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
790       cdf_vec = _mm_sub_epi16(cdf_vec, delta);
791       StoreLo8(cdf, cdf_vec);
792 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
793       cdf[0] -= cdf[0] >> rate;
794       cdf[1] -= cdf[1] >> rate;
795       cdf[2] -= cdf[2] >> rate;
796 #endif
797     }
798     goto found;
799   }
800   ++symbol;
801   delta -= kMinimumProbabilityPerSymbol;
802   // Iteration 1.
803   prev = curr;
804   curr =
805       ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
806   if (symbol_value >= curr) {
807     // symbol == 1.
808     if (allow_update_cdf_) {
809       // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/1).
810       const uint16_t count = cdf[4];
811       cdf[4] += static_cast<uint16_t>(count < 32);
812       const int rate = (4 | (count >> 4)) + 1;
813       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
814       cdf[1] -= cdf[1] >> rate;
815       cdf[2] -= cdf[2] >> rate;
816     }
817     goto found;
818   }
819   ++symbol;
820   delta -= kMinimumProbabilityPerSymbol;
821   // Iteration 2.
822   prev = curr;
823   curr =
824       ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
825   if (symbol_value >= curr) {
826     // symbol == 2.
827     if (allow_update_cdf_) {
828       // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
829       const uint16_t count = cdf[4];
830       cdf[4] += static_cast<uint16_t>(count < 32);
831       const int rate = (4 | (count >> 4)) + 1;
832       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
833       cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
834       cdf[2] -= cdf[2] >> rate;
835     }
836     goto found;
837   }
838   ++symbol;
839   // |delta| is 0 for the last iteration.
840   // Iteration 3.
841   prev = curr;
842   // Since cdf[3] is 0 and |delta| is 0, |curr| is also 0.
843   curr = 0;
844   // symbol == 3.
845   if (allow_update_cdf_) {
846     // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/3).
847     const uint16_t count = cdf[4];
848     cdf[4] += static_cast<uint16_t>(count < 32);
849     const int rate = (4 | (count >> 4)) + 1;
850 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
851     // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
852     // code is a tiny bit slower. Consider using the C version if __arm__ is
853     // defined.
854     uint16x4_t cdf_vec = vld1_u16(cdf);
855     const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
856     const int16x4_t diff =
857         vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
858     const int16x4_t negative_rate = vdup_n_s16(-rate);
859     const uint16x4_t delta =
860         vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
861     cdf_vec = vadd_u16(cdf_vec, delta);
862     vst1_u16(cdf, cdf_vec);
863     cdf[3] = 0;
864 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
865     __m128i cdf_vec = LoadLo8(cdf);
866     const __m128i cdf_max_probability =
867         _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
868     const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
869     const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
870     cdf_vec = _mm_add_epi16(cdf_vec, delta);
871     StoreLo8(cdf, cdf_vec);
872     cdf[3] = 0;
873 #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
874     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
875     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
876     cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
877 #endif
878   }
879 found:
880   // End of unrolled do-while loop.
881 
882   values_in_range_ = prev - curr;
883   window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
884   NormalizeRange();
885   return symbol;
886 }
887 
ReadSymbolImpl8(const uint16_t * const cdf)888 int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) {
889   assert(cdf[7] == 0);
890   uint32_t curr = values_in_range_;
891   uint32_t prev;
892   const auto symbol_value =
893       static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
894   uint32_t delta = kMinimumProbabilityPerSymbol * 7;
895   // Search through the |cdf| array to determine where the scaled cdf value and
896   // |symbol_value| cross over.
897   //
898   // The original code is:
899   //
900   // int symbol = -1;
901   // do {
902   //   prev = curr;
903   //   curr =
904   //       (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
905   //       + delta;
906   //   delta -= kMinimumProbabilityPerSymbol;
907   // } while (symbol_value < curr);
908   //
909   // The do-while loop is unrolled with eight iterations.
910   int symbol = 0;
911 
912 #define READ_SYMBOL_ITERATION                                                \
913   prev = curr;                                                               \
914   curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
915          delta;                                                              \
916   if (symbol_value >= curr) goto found;                                      \
917   ++symbol;                                                                  \
918   delta -= kMinimumProbabilityPerSymbol
919 
920   READ_SYMBOL_ITERATION;  // Iteration 0.
921   READ_SYMBOL_ITERATION;  // Iteration 1.
922   READ_SYMBOL_ITERATION;  // Iteration 2.
923   READ_SYMBOL_ITERATION;  // Iteration 3.
924   READ_SYMBOL_ITERATION;  // Iteration 4.
925   READ_SYMBOL_ITERATION;  // Iteration 5.
926 
927   // The last two iterations can be simplified, so they don't use the
928   // READ_SYMBOL_ITERATION macro.
929 #undef READ_SYMBOL_ITERATION
930 
931   // Iteration 6.
932   prev = curr;
933   curr =
934       (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
935   if (symbol_value >= curr) goto found;  // symbol == 6.
936   ++symbol;
937   // |delta| is 0 for the last iteration.
938   // Iteration 7.
939   prev = curr;
940   // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
941   curr = 0;
942   // symbol == 7.
943 found:
944   // End of unrolled do-while loop.
945 
946   values_in_range_ = prev - curr;
947   window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
948   NormalizeRange();
949   return symbol;
950 }
951 
PopulateBits()952 void DaalaBitReader::PopulateBits() {
953 #if defined(__aarch64__)
954   // Fast path: read eight bytes and add the first six bytes to window_diff_.
955   // This fast path makes the following assumptions.
956   // 1. We assume that unaligned load of uint64_t is fast.
957   // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
958   //    bytes depending on the value of bits_. This fast path always reads 6
959   //    bytes, which results in more calls to PopulateBits(). We assume that
960   //    making more calls to a faster PopulateBits() is overall a win.
961   // NOTE: Although this fast path could also be used on x86_64, it hurts
962   // performance (measured on Lenovo ThinkStation P920 running Linux). (The
963   // reason is still unknown.) Therefore this fast path is only used on arm64.
964   static_assert(kWindowSize == 64, "");
965   if (size_ - data_index_ >= 8) {
966     uint64_t value;
967     // arm64 supports unaligned loads, so this memcpy call is compiled to a
968     // single ldr instruction.
969     memcpy(&value, &data_[data_index_], 8);
970 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
971     value = __builtin_bswap64(value);
972 #endif
973     value &= 0xffffffffffff0000;
974     window_diff_ ^= static_cast<WindowSize>(value) >> (bits_ + 16);
975     data_index_ += 6;
976     bits_ += 6 * 8;
977     return;
978   }
979 #endif
980 
981   size_t data_index = data_index_;
982   int bits = bits_;
983   WindowSize window_diff = window_diff_;
984 
985   int shift = kWindowSize - 9 - (bits + 15);
986   // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
987   // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
988   // iterations when WindowSize is 64 bits. So it is not profitable to
989   // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
990   // the fast path above is not compiled.
991 
992 #ifdef __clang__
993 #pragma clang loop vectorize(disable) interleave(disable)
994 #endif
995   for (; shift >= 0 && data_index < size_; shift -= 8) {
996     window_diff ^= static_cast<WindowSize>(data_[data_index++]) << shift;
997     bits += 8;
998   }
999   if (data_index >= size_) {
1000     bits = kLargeBitCount;
1001   }
1002 
1003   data_index_ = data_index;
1004   bits_ = bits;
1005   window_diff_ = window_diff;
1006 }
1007 
NormalizeRange()1008 void DaalaBitReader::NormalizeRange() {
1009   const int bits_used = 15 - FloorLog2(values_in_range_);
1010   bits_ -= bits_used;
1011   window_diff_ = ((window_diff_ + 1) << bits_used) - 1;
1012   values_in_range_ <<= bits_used;
1013   if (bits_ < 0) PopulateBits();
1014 }
1015 
1016 // Explicit instantiations.
1017 template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
1018 template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
1019 template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
1020 template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
1021 template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
1022 template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
1023 template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
1024 template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
1025 template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
1026 template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
1027 
1028 }  // namespace libgav1
1029