1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/utils/entropy_decoder.h"
16
17 #include <cassert>
18 #include <cstring>
19
20 #include "src/utils/common.h"
21 #include "src/utils/compiler_attributes.h"
22 #include "src/utils/constants.h"
23
24 #if defined(__ARM_NEON__) || defined(__aarch64__) || \
25 (defined(_MSC_VER) && defined(_M_ARM))
26 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
27 #else
28 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
29 #endif
30
31 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
32 #include <arm_neon.h>
33 #endif
34
35 #if defined(__SSE4_1__) || defined(LIBGAV1_X86_MSVC)
36 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 1
37 #else
38 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4 0
39 #endif
40
41 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
42 #include <smmintrin.h>
43 #endif
44
45 namespace libgav1 {
46 namespace {
47
48 constexpr uint32_t kReadBitMask = ~255;
49 // This constant is used to set the value of |bits_| so that bits can be read
50 // after end of stream without trying to refill the buffer for a reasonably long
51 // time.
52 constexpr int kLargeBitCount = 0x4000;
53 constexpr int kCdfPrecision = 6;
54 constexpr int kMinimumProbabilityPerSymbol = 4;
55
56 // This function computes the "cur" variable as specified inside the do-while
57 // loop in Section 8.2.6 of the spec. This function is monotonically
58 // decreasing as the values of index increases (note that the |cdf| array is
59 // sorted in decreasing order).
ScaleCdf(uint32_t values_in_range_shifted,const uint16_t * const cdf,int index,int symbol_count)60 uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
61 int index, int symbol_count) {
62 return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
63 (kMinimumProbabilityPerSymbol * (symbol_count - index));
64 }
65
UpdateCdf(uint16_t * const cdf,const int symbol_count,const int symbol)66 void UpdateCdf(uint16_t* const cdf, const int symbol_count, const int symbol) {
67 const uint16_t count = cdf[symbol_count];
68 // rate is computed in the spec as:
69 // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
70 // In this case cdf[N] is |count|.
71 // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
72 // symbol_count > 3. So the equation becomes:
73 // 4 + (count > 15) + (count > 31) + (symbol_count > 3).
74 // Note that the largest value for count is 32 (it is not incremented beyond
75 // 32). So using that information:
76 // count >> 4 is 0 for count from 0 to 15.
77 // count >> 4 is 1 for count from 16 to 31.
78 // count >> 4 is 2 for count == 31.
79 // Now, the equation becomes:
80 // 4 + (count >> 4) + (symbol_count > 3).
81 // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
82 // with bitwise or. So the final equation is:
83 // (4 | (count >> 4)) + (symbol_count > 3).
84 const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count > 3);
85 // Hints for further optimizations:
86 //
87 // 1. clang can vectorize this for loop with width 4, even though the loop
88 // contains an if-else statement. Therefore, it may be advantageous to use
89 // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
90 // (a multiple of 4 that's not too small).
91 //
92 // 2. The for loop can be rewritten in the following form, which would enable
93 // clang to vectorize the loop with width 8:
94 //
95 // const int rounding = (1 << rate) - 1;
96 // for (int i = 0; i < symbol_count - 1; ++i) {
97 // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
98 // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
99 // }
100 //
101 // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
102 // integer arithmetic. The result of the unsigned subtraction is cast to a
103 // signed integer and right-shifted. This requires the right shift of a
104 // signed integer be an arithmetic shift, which is true for clang, gcc, and
105 // Visual C++.
106 for (int i = 0; i < symbol_count - 1; ++i) {
107 if (i < symbol) {
108 cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
109 } else {
110 cdf[i] -= cdf[i] >> rate;
111 }
112 }
113 cdf[symbol_count] += static_cast<uint16_t>(count < 32);
114 }
115
116 // Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
117 // of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
118 // SIMD instruction sets if available.
119
120 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
121
122 // The UpdateCdf() method contains the following for loop:
123 //
124 // for (int i = 0; i < symbol_count - 1; ++i) {
125 // if (i < symbol) {
126 // cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
127 // } else {
128 // cdf[i] -= cdf[i] >> rate;
129 // }
130 // }
131 //
132 // It can be rewritten in the following two forms, which are amenable to SIMD
133 // implementations:
134 //
135 // const int rounding = (1 << rate) - 1;
136 // for (int i = 0; i < symbol_count - 1; ++i) {
137 // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
138 // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
139 // }
140 //
141 // or:
142 //
143 // const int rounding = (1 << rate) - 1;
144 // for (int i = 0; i < symbol_count - 1; ++i) {
145 // const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
146 // cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
147 // }
148 //
149 // The following ARM NEON implementations use the second form, which seems
150 // slightly faster.
151 //
152 // The cdf array has symbol_count + 1 elements. The first symbol_count elements
153 // are the CDF. The last element is a count that is initialized to 0 and may
154 // grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
155 // cdf[symbol_count - 1] is always 0, the for loop does not update
156 // cdf[symbol_count - 1]. However, it would be correct to have the for loop
157 // update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
158 // for loop would take the else branch when i is symbol_count - 1:
159 // cdf[i] -= cdf[i] >> rate;
160 // Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
161 // after the update. The ARM NEON implementations take advantage of this in the
162 // following two cases:
163 // 1. When symbol_count is 8 or 16, the vectorized code updates the first
164 // symbol_count elements in the array.
165 // 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
166 // the cdf array. Since an invalid CDF value is written into cdf[7], the
167 // count in cdf[7] needs to be fixed up after the vectorized code.
168
UpdateCdf5(uint16_t * const cdf,const int symbol)169 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
170 uint16x4_t cdf_vec = vld1_u16(cdf);
171 const uint16_t count = cdf[5];
172 const int rate = (4 | (count >> 4)) + 1;
173 const uint16x4_t zero = vdup_n_u16(0);
174 const uint16x4_t cdf_max_probability =
175 vdup_n_u16(kCdfMaxProbability + 1 - (1 << rate));
176 const uint16x4_t index = vcreate_u16(0x0003000200010000);
177 const uint16x4_t symbol_vec = vdup_n_u16(symbol);
178 const uint16x4_t mask = vclt_u16(index, symbol_vec);
179 const uint16x4_t a = vbsl_u16(mask, cdf_max_probability, zero);
180 const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(cdf_vec, a));
181 const int16x4_t negative_rate = vdup_n_s16(-rate);
182 const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
183 cdf_vec = vsub_u16(cdf_vec, delta);
184 vst1_u16(cdf, cdf_vec);
185 cdf[5] = count + static_cast<uint16_t>(count < 32);
186 }
187
188 // This version works for |symbol_count| = 7, 8, or 9.
189 template <int symbol_count>
UpdateCdf7To9(uint16_t * const cdf,const int symbol)190 void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
191 static_assert(symbol_count >= 7 && symbol_count <= 9, "");
192 uint16x8_t cdf_vec = vld1q_u16(cdf);
193 const uint16_t count = cdf[symbol_count];
194 const int rate = (4 | (count >> 4)) + 1;
195 const uint16x8_t zero = vdupq_n_u16(0);
196 const uint16x8_t cdf_max_probability =
197 vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
198 const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
199 vcreate_u16(0x0007000600050004));
200 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
201 const uint16x8_t mask = vcltq_u16(index, symbol_vec);
202 const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
203 const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
204 const int16x8_t negative_rate = vdupq_n_s16(-rate);
205 const uint16x8_t delta =
206 vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
207 cdf_vec = vsubq_u16(cdf_vec, delta);
208 vst1q_u16(cdf, cdf_vec);
209 cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
210 }
211
UpdateCdf7(uint16_t * const cdf,const int symbol)212 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
213 UpdateCdf7To9<7>(cdf, symbol);
214 }
215
UpdateCdf8(uint16_t * const cdf,const int symbol)216 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
217 UpdateCdf7To9<8>(cdf, symbol);
218 }
219
UpdateCdf11(uint16_t * const cdf,const int symbol)220 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
221 uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
222 const uint16_t count = cdf[11];
223 cdf[11] = count + static_cast<uint16_t>(count < 32);
224 const int rate = (4 | (count >> 4)) + 1;
225 if (symbol > 1) {
226 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
227 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
228 const uint16x8_t zero = vdupq_n_u16(0);
229 const uint16x8_t cdf_max_probability =
230 vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
231 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
232 const int16x8_t negative_rate = vdupq_n_s16(-rate);
233 const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
234 vcreate_u16(0x0009000800070006));
235 const uint16x8_t mask = vcltq_u16(index, symbol_vec);
236 const uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
237 const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
238 const uint16x8_t delta =
239 vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
240 cdf_vec = vsubq_u16(cdf_vec, delta);
241 vst1q_u16(cdf + 2, cdf_vec);
242 } else {
243 if (symbol != 0) {
244 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
245 cdf[1] -= cdf[1] >> rate;
246 } else {
247 cdf[0] -= cdf[0] >> rate;
248 cdf[1] -= cdf[1] >> rate;
249 }
250 const int16x8_t negative_rate = vdupq_n_s16(-rate);
251 const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
252 cdf_vec = vsubq_u16(cdf_vec, delta);
253 vst1q_u16(cdf + 2, cdf_vec);
254 }
255 }
256
UpdateCdf13(uint16_t * const cdf,const int symbol)257 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
258 uint16x8_t cdf_vec0 = vld1q_u16(cdf);
259 uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
260 const uint16_t count = cdf[13];
261 const int rate = (4 | (count >> 4)) + 1;
262 const uint16x8_t zero = vdupq_n_u16(0);
263 const uint16x8_t cdf_max_probability =
264 vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
265 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
266 const int16x8_t negative_rate = vdupq_n_s16(-rate);
267
268 uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
269 vcreate_u16(0x0007000600050004));
270 uint16x8_t mask = vcltq_u16(index, symbol_vec);
271 uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
272 int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec0, a));
273 uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
274 cdf_vec0 = vsubq_u16(cdf_vec0, delta);
275 vst1q_u16(cdf, cdf_vec0);
276
277 index = vcombine_u16(vcreate_u16(0x0007000600050004),
278 vcreate_u16(0x000b000a00090008));
279 mask = vcltq_u16(index, symbol_vec);
280 a = vbslq_u16(mask, cdf_max_probability, zero);
281 diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec1, a));
282 delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
283 cdf_vec1 = vsubq_u16(cdf_vec1, delta);
284 vst1q_u16(cdf + 4, cdf_vec1);
285
286 cdf[13] = count + static_cast<uint16_t>(count < 32);
287 }
288
UpdateCdf16(uint16_t * const cdf,const int symbol)289 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
290 uint16x8_t cdf_vec = vld1q_u16(cdf);
291 const uint16_t count = cdf[16];
292 const int rate = (4 | (count >> 4)) + 1;
293 const uint16x8_t zero = vdupq_n_u16(0);
294 const uint16x8_t cdf_max_probability =
295 vdupq_n_u16(kCdfMaxProbability + 1 - (1 << rate));
296 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
297 const int16x8_t negative_rate = vdupq_n_s16(-rate);
298
299 uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
300 vcreate_u16(0x0007000600050004));
301 uint16x8_t mask = vcltq_u16(index, symbol_vec);
302 uint16x8_t a = vbslq_u16(mask, cdf_max_probability, zero);
303 int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
304 uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
305 cdf_vec = vsubq_u16(cdf_vec, delta);
306 vst1q_u16(cdf, cdf_vec);
307
308 cdf_vec = vld1q_u16(cdf + 8);
309 index = vcombine_u16(vcreate_u16(0x000b000a00090008),
310 vcreate_u16(0x000f000e000d000c));
311 mask = vcltq_u16(index, symbol_vec);
312 a = vbslq_u16(mask, cdf_max_probability, zero);
313 diff = vreinterpretq_s16_u16(vsubq_u16(cdf_vec, a));
314 delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
315 cdf_vec = vsubq_u16(cdf_vec, delta);
316 vst1q_u16(cdf + 8, cdf_vec);
317
318 cdf[16] = count + static_cast<uint16_t>(count < 32);
319 }
320
321 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
322
323 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
324
LoadLo8(const void * a)325 inline __m128i LoadLo8(const void* a) {
326 return _mm_loadl_epi64(static_cast<const __m128i*>(a));
327 }
328
LoadUnaligned16(const void * a)329 inline __m128i LoadUnaligned16(const void* a) {
330 return _mm_loadu_si128(static_cast<const __m128i*>(a));
331 }
332
StoreLo8(void * a,const __m128i v)333 inline void StoreLo8(void* a, const __m128i v) {
334 _mm_storel_epi64(static_cast<__m128i*>(a), v);
335 }
336
StoreUnaligned16(void * a,const __m128i v)337 inline void StoreUnaligned16(void* a, const __m128i v) {
338 _mm_storeu_si128(static_cast<__m128i*>(a), v);
339 }
340
UpdateCdf5(uint16_t * const cdf,const int symbol)341 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
342 __m128i cdf_vec = LoadLo8(cdf);
343 const uint16_t count = cdf[5];
344 const int rate = (4 | (count >> 4)) + 1;
345 const __m128i zero = _mm_setzero_si128();
346 const __m128i cdf_max_probability = _mm_shufflelo_epi16(
347 _mm_cvtsi32_si128(kCdfMaxProbability + 1 - (1 << rate)), 0);
348 const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00030002, 0x00010000);
349 const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
350 const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
351 const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
352 const __m128i diff = _mm_sub_epi16(cdf_vec, a);
353 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
354 cdf_vec = _mm_sub_epi16(cdf_vec, delta);
355 StoreLo8(cdf, cdf_vec);
356 cdf[5] = count + static_cast<uint16_t>(count < 32);
357 }
358
359 // This version works for |symbol_count| = 7, 8, or 9.
360 template <int symbol_count>
UpdateCdf7To9(uint16_t * const cdf,const int symbol)361 void UpdateCdf7To9(uint16_t* const cdf, const int symbol) {
362 static_assert(symbol_count >= 7 && symbol_count <= 9, "");
363 __m128i cdf_vec = LoadUnaligned16(cdf);
364 const uint16_t count = cdf[symbol_count];
365 const int rate = (4 | (count >> 4)) + 1;
366 const __m128i zero = _mm_setzero_si128();
367 const __m128i cdf_max_probability =
368 _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
369 const __m128i index =
370 _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
371 const __m128i symbol_vec = _mm_set1_epi16(symbol);
372 const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
373 const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
374 const __m128i diff = _mm_sub_epi16(cdf_vec, a);
375 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
376 cdf_vec = _mm_sub_epi16(cdf_vec, delta);
377 StoreUnaligned16(cdf, cdf_vec);
378 cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
379 }
380
UpdateCdf7(uint16_t * const cdf,const int symbol)381 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
382 UpdateCdf7To9<7>(cdf, symbol);
383 }
384
UpdateCdf8(uint16_t * const cdf,const int symbol)385 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
386 UpdateCdf7To9<8>(cdf, symbol);
387 }
388
UpdateCdf11(uint16_t * const cdf,const int symbol)389 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
390 __m128i cdf_vec = LoadUnaligned16(cdf + 2);
391 const uint16_t count = cdf[11];
392 cdf[11] = count + static_cast<uint16_t>(count < 32);
393 const int rate = (4 | (count >> 4)) + 1;
394 if (symbol > 1) {
395 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
396 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
397 const __m128i zero = _mm_setzero_si128();
398 const __m128i cdf_max_probability =
399 _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
400 const __m128i index =
401 _mm_set_epi32(0x00090008, 0x00070006, 0x00050004, 0x00030002);
402 const __m128i symbol_vec = _mm_set1_epi16(symbol);
403 const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
404 const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
405 const __m128i diff = _mm_sub_epi16(cdf_vec, a);
406 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
407 cdf_vec = _mm_sub_epi16(cdf_vec, delta);
408 StoreUnaligned16(cdf + 2, cdf_vec);
409 } else {
410 if (symbol != 0) {
411 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
412 cdf[1] -= cdf[1] >> rate;
413 } else {
414 cdf[0] -= cdf[0] >> rate;
415 cdf[1] -= cdf[1] >> rate;
416 }
417 const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
418 cdf_vec = _mm_sub_epi16(cdf_vec, delta);
419 StoreUnaligned16(cdf + 2, cdf_vec);
420 }
421 }
422
UpdateCdf13(uint16_t * const cdf,const int symbol)423 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
424 __m128i cdf_vec0 = LoadUnaligned16(cdf);
425 __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
426 const uint16_t count = cdf[13];
427 const int rate = (4 | (count >> 4)) + 1;
428 const __m128i zero = _mm_setzero_si128();
429 const __m128i cdf_max_probability =
430 _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
431 const __m128i symbol_vec = _mm_set1_epi16(symbol);
432
433 const __m128i index =
434 _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
435 const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
436 const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
437 const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
438 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
439 cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
440 StoreUnaligned16(cdf, cdf_vec0);
441
442 const __m128i index1 =
443 _mm_set_epi32(0x000b000a, 0x00090008, 0x00070006, 0x00050004);
444 const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
445 const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
446 const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
447 const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
448 cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
449 StoreUnaligned16(cdf + 4, cdf_vec1);
450
451 cdf[13] = count + static_cast<uint16_t>(count < 32);
452 }
453
UpdateCdf16(uint16_t * const cdf,const int symbol)454 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
455 __m128i cdf_vec0 = LoadUnaligned16(cdf);
456 const uint16_t count = cdf[16];
457 const int rate = (4 | (count >> 4)) + 1;
458 const __m128i zero = _mm_setzero_si128();
459 const __m128i cdf_max_probability =
460 _mm_set1_epi16(kCdfMaxProbability + 1 - (1 << rate));
461 const __m128i symbol_vec = _mm_set1_epi16(symbol);
462
463 const __m128i index =
464 _mm_set_epi32(0x00070006, 0x00050004, 0x00030002, 0x00010000);
465 const __m128i mask = _mm_cmplt_epi16(index, symbol_vec);
466 const __m128i a = _mm_blendv_epi8(zero, cdf_max_probability, mask);
467 const __m128i diff = _mm_sub_epi16(cdf_vec0, a);
468 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
469 cdf_vec0 = _mm_sub_epi16(cdf_vec0, delta);
470 StoreUnaligned16(cdf, cdf_vec0);
471
472 __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
473 const __m128i index1 =
474 _mm_set_epi32(0x000f000e, 0x000d000c, 0x000b000a, 0x00090008);
475 const __m128i mask1 = _mm_cmplt_epi16(index1, symbol_vec);
476 const __m128i a1 = _mm_blendv_epi8(zero, cdf_max_probability, mask1);
477 const __m128i diff1 = _mm_sub_epi16(cdf_vec1, a1);
478 const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
479 cdf_vec1 = _mm_sub_epi16(cdf_vec1, delta1);
480 StoreUnaligned16(cdf + 8, cdf_vec1);
481
482 cdf[16] = count + static_cast<uint16_t>(count < 32);
483 }
484
485 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
486
UpdateCdf5(uint16_t * const cdf,const int symbol)487 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
488 UpdateCdf(cdf, 5, symbol);
489 }
490
UpdateCdf7(uint16_t * const cdf,const int symbol)491 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
492 UpdateCdf(cdf, 7, symbol);
493 }
494
UpdateCdf8(uint16_t * const cdf,const int symbol)495 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
496 UpdateCdf(cdf, 8, symbol);
497 }
498
UpdateCdf11(uint16_t * const cdf,const int symbol)499 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
500 UpdateCdf(cdf, 11, symbol);
501 }
502
UpdateCdf13(uint16_t * const cdf,const int symbol)503 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
504 UpdateCdf(cdf, 13, symbol);
505 }
506
UpdateCdf16(uint16_t * const cdf,const int symbol)507 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
508 UpdateCdf(cdf, 16, symbol);
509 }
510
511 #endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
512 #endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
513
514 } // namespace
515
516 #if !LIBGAV1_CXX17
517 constexpr int DaalaBitReader::kWindowSize; // static.
518 #endif
519
DaalaBitReader(const uint8_t * data,size_t size,bool allow_update_cdf)520 DaalaBitReader::DaalaBitReader(const uint8_t* data, size_t size,
521 bool allow_update_cdf)
522 : data_(data),
523 size_(size),
524 data_index_(0),
525 allow_update_cdf_(allow_update_cdf) {
526 window_diff_ = (WindowSize{1} << (kWindowSize - 1)) - 1;
527 values_in_range_ = kCdfMaxProbability;
528 bits_ = -15;
529 PopulateBits();
530 }
531
532 // This is similar to the ReadSymbol() implementation but it is optimized based
533 // on the following facts:
534 // * The probability is fixed at half. So some multiplications can be replaced
535 // with bit operations.
536 // * Symbol count is fixed at 2.
ReadBit()537 int DaalaBitReader::ReadBit() {
538 const uint32_t curr =
539 ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
540 const WindowSize zero_threshold = static_cast<WindowSize>(curr)
541 << (kWindowSize - 16);
542 int bit = 1;
543 if (window_diff_ >= zero_threshold) {
544 values_in_range_ -= curr;
545 window_diff_ -= zero_threshold;
546 bit = 0;
547 } else {
548 values_in_range_ = curr;
549 }
550 NormalizeRange();
551 return bit;
552 }
553
ReadLiteral(int num_bits)554 int64_t DaalaBitReader::ReadLiteral(int num_bits) {
555 assert(num_bits <= 32);
556 assert(num_bits > 0);
557 uint32_t literal = 0;
558 int bit = num_bits - 1;
559 do {
560 // ARM can combine a shift operation with a constant number of bits with
561 // some other operations, such as the OR operation.
562 // Here is an ARM disassembly example:
563 // orr w1, w0, w1, lsl #1
564 // which left shifts register w1 by 1 bit and OR the shift result with
565 // register w0.
566 // The next 2 lines are equivalent to:
567 // literal |= static_cast<uint32_t>(ReadBit()) << bit;
568 literal <<= 1;
569 literal |= static_cast<uint32_t>(ReadBit());
570 } while (--bit >= 0);
571 return literal;
572 }
573
ReadSymbol(uint16_t * const cdf,int symbol_count)574 int DaalaBitReader::ReadSymbol(uint16_t* const cdf, int symbol_count) {
575 const int symbol = ReadSymbolImpl(cdf, symbol_count);
576 if (allow_update_cdf_) {
577 UpdateCdf(cdf, symbol_count, symbol);
578 }
579 return symbol;
580 }
581
ReadSymbol(uint16_t * cdf)582 bool DaalaBitReader::ReadSymbol(uint16_t* cdf) {
583 const bool symbol = ReadSymbolImpl(cdf) != 0;
584 if (allow_update_cdf_) {
585 const uint16_t count = cdf[2];
586 // rate is computed in the spec as:
587 // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
588 // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
589 // 4 + (count > 15) + (count > 31)
590 // Note that the largest value for count is 32 (it is not incremented beyond
591 // 32). So using that information:
592 // count >> 4 is 0 for count from 0 to 15.
593 // count >> 4 is 1 for count from 16 to 31.
594 // count >> 4 is 2 for count == 32.
595 // Now, the equation becomes:
596 // 4 + (count >> 4).
597 // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
598 // with bitwise or. So the final equation is:
599 // 4 | (count >> 4).
600 const int rate = 4 | (count >> 4);
601 if (symbol) {
602 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
603 } else {
604 cdf[0] -= cdf[0] >> rate;
605 }
606 cdf[2] += static_cast<uint16_t>(count < 32);
607 }
608 return symbol;
609 }
610
ReadSymbolWithoutCdfUpdate(uint16_t * cdf)611 bool DaalaBitReader::ReadSymbolWithoutCdfUpdate(uint16_t* cdf) {
612 return ReadSymbolImpl(cdf) != 0;
613 }
614
615 template <int symbol_count>
ReadSymbol(uint16_t * const cdf)616 int DaalaBitReader::ReadSymbol(uint16_t* const cdf) {
617 static_assert(symbol_count >= 3 && symbol_count <= 16, "");
618 if (symbol_count == 4) {
619 return ReadSymbol4(cdf);
620 }
621 int symbol;
622 if (symbol_count == 8) {
623 symbol = ReadSymbolImpl8(cdf);
624 } else if (symbol_count <= 13) {
625 symbol = ReadSymbolImpl(cdf, symbol_count);
626 } else {
627 symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
628 }
629 if (allow_update_cdf_) {
630 if (symbol_count == 5) {
631 UpdateCdf5(cdf, symbol);
632 } else if (symbol_count == 7) {
633 UpdateCdf7(cdf, symbol);
634 } else if (symbol_count == 8) {
635 UpdateCdf8(cdf, symbol);
636 } else if (symbol_count == 11) {
637 UpdateCdf11(cdf, symbol);
638 } else if (symbol_count == 13) {
639 UpdateCdf13(cdf, symbol);
640 } else if (symbol_count == 16) {
641 UpdateCdf16(cdf, symbol);
642 } else {
643 UpdateCdf(cdf, symbol_count, symbol);
644 }
645 }
646 return symbol;
647 }
648
ReadSymbolImpl(const uint16_t * const cdf,int symbol_count)649 int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf,
650 int symbol_count) {
651 assert(cdf[symbol_count - 1] == 0);
652 --symbol_count;
653 uint32_t curr = values_in_range_;
654 int symbol = -1;
655 uint32_t prev;
656 const auto symbol_value =
657 static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
658 uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
659 // Search through the |cdf| array to determine where the scaled cdf value and
660 // |symbol_value| cross over.
661 do {
662 prev = curr;
663 curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
664 delta;
665 delta -= kMinimumProbabilityPerSymbol;
666 } while (symbol_value < curr);
667 values_in_range_ = prev - curr;
668 window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
669 NormalizeRange();
670 return symbol;
671 }
672
ReadSymbolImplBinarySearch(const uint16_t * const cdf,int symbol_count)673 int DaalaBitReader::ReadSymbolImplBinarySearch(const uint16_t* const cdf,
674 int symbol_count) {
675 assert(cdf[symbol_count - 1] == 0);
676 assert(symbol_count > 1 && symbol_count <= 16);
677 --symbol_count;
678 const auto symbol_value =
679 static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
680 // Search through the |cdf| array to determine where the scaled cdf value and
681 // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
682 // search to do this. Let |symbol| be the index of the first |cdf| array
683 // entry whose scaled cdf value is less than or equal to |symbol_value|. The
684 // binary search maintains the invariant:
685 // low <= symbol <= high + 1
686 // and terminates when low == high + 1.
687 int low = 0;
688 int high = symbol_count - 1;
689 // The binary search maintains the invariants that |prev| is the scaled cdf
690 // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
691 // convention, the scaled cdf value for -1 is values_in_range_.) When the
692 // binary search terminates, |prev| is the scaled cdf value for symbol - 1
693 // and |curr| is the scaled cdf value for |symbol|.
694 uint32_t prev = values_in_range_;
695 uint32_t curr = 0;
696 const uint32_t values_in_range_shifted = values_in_range_ >> 8;
697 do {
698 const int mid = DivideBy2(low + high);
699 const uint32_t scaled_cdf =
700 ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
701 if (symbol_value < scaled_cdf) {
702 low = mid + 1;
703 prev = scaled_cdf;
704 } else {
705 high = mid - 1;
706 curr = scaled_cdf;
707 }
708 } while (low <= high);
709 assert(low == high + 1);
710 // At this point, |low| is the symbol that has been decoded.
711 values_in_range_ = prev - curr;
712 window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
713 NormalizeRange();
714 return low;
715 }
716
ReadSymbolImpl(const uint16_t * const cdf)717 int DaalaBitReader::ReadSymbolImpl(const uint16_t* const cdf) {
718 assert(cdf[1] == 0);
719 const auto symbol_value =
720 static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
721 const uint32_t curr = ScaleCdf(values_in_range_ >> 8, cdf, 0, 1);
722 const int symbol = static_cast<int>(symbol_value < curr);
723 if (symbol == 1) {
724 values_in_range_ = curr;
725 } else {
726 values_in_range_ -= curr;
727 window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
728 }
729 NormalizeRange();
730 return symbol;
731 }
732
733 // Equivalent to ReadSymbol(cdf, 4), with the ReadSymbolImpl and UpdateCdf
734 // calls inlined.
ReadSymbol4(uint16_t * const cdf)735 int DaalaBitReader::ReadSymbol4(uint16_t* const cdf) {
736 assert(cdf[3] == 0);
737 uint32_t curr = values_in_range_;
738 uint32_t prev;
739 const auto symbol_value =
740 static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
741 uint32_t delta = kMinimumProbabilityPerSymbol * 3;
742 const uint32_t values_in_range_shifted = values_in_range_ >> 8;
743
744 // Search through the |cdf| array to determine where the scaled cdf value and
745 // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
746 // array.
747 //
748 // The original code is:
749 //
750 // int symbol = -1;
751 // do {
752 // prev = curr;
753 // curr =
754 // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
755 // + delta;
756 // delta -= kMinimumProbabilityPerSymbol;
757 // } while (symbol_value < curr);
758 // if (allow_update_cdf_) {
759 // UpdateCdf(cdf, 4, symbol);
760 // }
761 //
762 // The do-while loop is unrolled with four iterations, and the UpdateCdf call
763 // is inlined and merged into the four iterations.
764 int symbol = 0;
765 // Iteration 0.
766 prev = curr;
767 curr =
768 ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
769 if (symbol_value >= curr) {
770 // symbol == 0.
771 if (allow_update_cdf_) {
772 // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/0).
773 const uint16_t count = cdf[4];
774 cdf[4] += static_cast<uint16_t>(count < 32);
775 const int rate = (4 | (count >> 4)) + 1;
776 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
777 // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
778 // NEON code is slower. Consider using the C version if __arm__ is
779 // defined.
780 // 2. The ARM NEON code (compiled for arm64) is slightly slower on
781 // Samsung Galaxy S8+ (SM-G955FD).
782 uint16x4_t cdf_vec = vld1_u16(cdf);
783 const int16x4_t negative_rate = vdup_n_s16(-rate);
784 const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
785 cdf_vec = vsub_u16(cdf_vec, delta);
786 vst1_u16(cdf, cdf_vec);
787 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
788 __m128i cdf_vec = LoadLo8(cdf);
789 const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
790 cdf_vec = _mm_sub_epi16(cdf_vec, delta);
791 StoreLo8(cdf, cdf_vec);
792 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
793 cdf[0] -= cdf[0] >> rate;
794 cdf[1] -= cdf[1] >> rate;
795 cdf[2] -= cdf[2] >> rate;
796 #endif
797 }
798 goto found;
799 }
800 ++symbol;
801 delta -= kMinimumProbabilityPerSymbol;
802 // Iteration 1.
803 prev = curr;
804 curr =
805 ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
806 if (symbol_value >= curr) {
807 // symbol == 1.
808 if (allow_update_cdf_) {
809 // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/1).
810 const uint16_t count = cdf[4];
811 cdf[4] += static_cast<uint16_t>(count < 32);
812 const int rate = (4 | (count >> 4)) + 1;
813 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
814 cdf[1] -= cdf[1] >> rate;
815 cdf[2] -= cdf[2] >> rate;
816 }
817 goto found;
818 }
819 ++symbol;
820 delta -= kMinimumProbabilityPerSymbol;
821 // Iteration 2.
822 prev = curr;
823 curr =
824 ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
825 if (symbol_value >= curr) {
826 // symbol == 2.
827 if (allow_update_cdf_) {
828 // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
829 const uint16_t count = cdf[4];
830 cdf[4] += static_cast<uint16_t>(count < 32);
831 const int rate = (4 | (count >> 4)) + 1;
832 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
833 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
834 cdf[2] -= cdf[2] >> rate;
835 }
836 goto found;
837 }
838 ++symbol;
839 // |delta| is 0 for the last iteration.
840 // Iteration 3.
841 prev = curr;
842 // Since cdf[3] is 0 and |delta| is 0, |curr| is also 0.
843 curr = 0;
844 // symbol == 3.
845 if (allow_update_cdf_) {
846 // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/3).
847 const uint16_t count = cdf[4];
848 cdf[4] += static_cast<uint16_t>(count < 32);
849 const int rate = (4 | (count >> 4)) + 1;
850 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
851 // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
852 // code is a tiny bit slower. Consider using the C version if __arm__ is
853 // defined.
854 uint16x4_t cdf_vec = vld1_u16(cdf);
855 const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
856 const int16x4_t diff =
857 vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
858 const int16x4_t negative_rate = vdup_n_s16(-rate);
859 const uint16x4_t delta =
860 vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
861 cdf_vec = vadd_u16(cdf_vec, delta);
862 vst1_u16(cdf, cdf_vec);
863 cdf[3] = 0;
864 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
865 __m128i cdf_vec = LoadLo8(cdf);
866 const __m128i cdf_max_probability =
867 _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
868 const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
869 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
870 cdf_vec = _mm_add_epi16(cdf_vec, delta);
871 StoreLo8(cdf, cdf_vec);
872 cdf[3] = 0;
873 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE4
874 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
875 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
876 cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
877 #endif
878 }
879 found:
880 // End of unrolled do-while loop.
881
882 values_in_range_ = prev - curr;
883 window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
884 NormalizeRange();
885 return symbol;
886 }
887
ReadSymbolImpl8(const uint16_t * const cdf)888 int DaalaBitReader::ReadSymbolImpl8(const uint16_t* const cdf) {
889 assert(cdf[7] == 0);
890 uint32_t curr = values_in_range_;
891 uint32_t prev;
892 const auto symbol_value =
893 static_cast<uint32_t>(window_diff_ >> (kWindowSize - 16));
894 uint32_t delta = kMinimumProbabilityPerSymbol * 7;
895 // Search through the |cdf| array to determine where the scaled cdf value and
896 // |symbol_value| cross over.
897 //
898 // The original code is:
899 //
900 // int symbol = -1;
901 // do {
902 // prev = curr;
903 // curr =
904 // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
905 // + delta;
906 // delta -= kMinimumProbabilityPerSymbol;
907 // } while (symbol_value < curr);
908 //
909 // The do-while loop is unrolled with eight iterations.
910 int symbol = 0;
911
912 #define READ_SYMBOL_ITERATION \
913 prev = curr; \
914 curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
915 delta; \
916 if (symbol_value >= curr) goto found; \
917 ++symbol; \
918 delta -= kMinimumProbabilityPerSymbol
919
920 READ_SYMBOL_ITERATION; // Iteration 0.
921 READ_SYMBOL_ITERATION; // Iteration 1.
922 READ_SYMBOL_ITERATION; // Iteration 2.
923 READ_SYMBOL_ITERATION; // Iteration 3.
924 READ_SYMBOL_ITERATION; // Iteration 4.
925 READ_SYMBOL_ITERATION; // Iteration 5.
926
927 // The last two iterations can be simplified, so they don't use the
928 // READ_SYMBOL_ITERATION macro.
929 #undef READ_SYMBOL_ITERATION
930
931 // Iteration 6.
932 prev = curr;
933 curr =
934 (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
935 if (symbol_value >= curr) goto found; // symbol == 6.
936 ++symbol;
937 // |delta| is 0 for the last iteration.
938 // Iteration 7.
939 prev = curr;
940 // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
941 curr = 0;
942 // symbol == 7.
943 found:
944 // End of unrolled do-while loop.
945
946 values_in_range_ = prev - curr;
947 window_diff_ -= static_cast<WindowSize>(curr) << (kWindowSize - 16);
948 NormalizeRange();
949 return symbol;
950 }
951
PopulateBits()952 void DaalaBitReader::PopulateBits() {
953 #if defined(__aarch64__)
954 // Fast path: read eight bytes and add the first six bytes to window_diff_.
955 // This fast path makes the following assumptions.
956 // 1. We assume that unaligned load of uint64_t is fast.
957 // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
958 // bytes depending on the value of bits_. This fast path always reads 6
959 // bytes, which results in more calls to PopulateBits(). We assume that
960 // making more calls to a faster PopulateBits() is overall a win.
961 // NOTE: Although this fast path could also be used on x86_64, it hurts
962 // performance (measured on Lenovo ThinkStation P920 running Linux). (The
963 // reason is still unknown.) Therefore this fast path is only used on arm64.
964 static_assert(kWindowSize == 64, "");
965 if (size_ - data_index_ >= 8) {
966 uint64_t value;
967 // arm64 supports unaligned loads, so this memcpy call is compiled to a
968 // single ldr instruction.
969 memcpy(&value, &data_[data_index_], 8);
970 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
971 value = __builtin_bswap64(value);
972 #endif
973 value &= 0xffffffffffff0000;
974 window_diff_ ^= static_cast<WindowSize>(value) >> (bits_ + 16);
975 data_index_ += 6;
976 bits_ += 6 * 8;
977 return;
978 }
979 #endif
980
981 size_t data_index = data_index_;
982 int bits = bits_;
983 WindowSize window_diff = window_diff_;
984
985 int shift = kWindowSize - 9 - (bits + 15);
986 // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
987 // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
988 // iterations when WindowSize is 64 bits. So it is not profitable to
989 // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
990 // the fast path above is not compiled.
991
992 #ifdef __clang__
993 #pragma clang loop vectorize(disable) interleave(disable)
994 #endif
995 for (; shift >= 0 && data_index < size_; shift -= 8) {
996 window_diff ^= static_cast<WindowSize>(data_[data_index++]) << shift;
997 bits += 8;
998 }
999 if (data_index >= size_) {
1000 bits = kLargeBitCount;
1001 }
1002
1003 data_index_ = data_index;
1004 bits_ = bits;
1005 window_diff_ = window_diff;
1006 }
1007
NormalizeRange()1008 void DaalaBitReader::NormalizeRange() {
1009 const int bits_used = 15 - FloorLog2(values_in_range_);
1010 bits_ -= bits_used;
1011 window_diff_ = ((window_diff_ + 1) << bits_used) - 1;
1012 values_in_range_ <<= bits_used;
1013 if (bits_ < 0) PopulateBits();
1014 }
1015
1016 // Explicit instantiations.
1017 template int DaalaBitReader::ReadSymbol<3>(uint16_t* cdf);
1018 template int DaalaBitReader::ReadSymbol<4>(uint16_t* cdf);
1019 template int DaalaBitReader::ReadSymbol<5>(uint16_t* cdf);
1020 template int DaalaBitReader::ReadSymbol<7>(uint16_t* cdf);
1021 template int DaalaBitReader::ReadSymbol<8>(uint16_t* cdf);
1022 template int DaalaBitReader::ReadSymbol<10>(uint16_t* cdf);
1023 template int DaalaBitReader::ReadSymbol<11>(uint16_t* cdf);
1024 template int DaalaBitReader::ReadSymbol<13>(uint16_t* cdf);
1025 template int DaalaBitReader::ReadSymbol<14>(uint16_t* cdf);
1026 template int DaalaBitReader::ReadSymbol<16>(uint16_t* cdf);
1027
1028 } // namespace libgav1
1029