1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/utils/entropy_decoder.h"
16
17 #include <cassert>
18 #include <cstring>
19
20 #include "src/utils/common.h"
21 #include "src/utils/compiler_attributes.h"
22 #include "src/utils/constants.h"
23 #include "src/utils/cpu.h"
24
25 #if defined(__ARM_NEON__) || defined(__aarch64__) || \
26 (defined(_MSC_VER) && defined(_M_ARM))
27 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
28 #else
29 #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
30 #endif
31
32 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
33 #include <arm_neon.h>
34 #endif
35
36 #if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
37 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
38 #else
39 #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
40 #endif
41
42 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
43 #include <emmintrin.h>
44 #endif
45
46 namespace libgav1 {
47 namespace {
48
49 constexpr uint32_t kReadBitMask = ~255;
50 constexpr int kCdfPrecision = 6;
51 constexpr int kMinimumProbabilityPerSymbol = 4;
52
53 // This function computes the "cur" variable as specified inside the do-while
54 // loop in Section 8.2.6 of the spec. This function is monotonically
55 // decreasing as the values of index increases (note that the |cdf| array is
56 // sorted in decreasing order).
ScaleCdf(uint32_t values_in_range_shifted,const uint16_t * const cdf,int index,int symbol_count)57 uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
58 int index, int symbol_count) {
59 return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
60 (kMinimumProbabilityPerSymbol * (symbol_count - index));
61 }
62
UpdateCdf(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol_count,const int symbol)63 void UpdateCdf(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol_count,
64 const int symbol) {
65 const uint16_t count = cdf[symbol_count];
66 // rate is computed in the spec as:
67 // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
68 // In this case cdf[N] is |count|.
69 // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
70 // symbol_count > 3. So the equation becomes:
71 // 4 + (count > 15) + (count > 31) + (symbol_count > 3).
72 // Note that the largest value for count is 32 (it is not incremented beyond
73 // 32). So using that information:
74 // count >> 4 is 0 for count from 0 to 15.
75 // count >> 4 is 1 for count from 16 to 31.
76 // count >> 4 is 2 for count == 31.
77 // Now, the equation becomes:
78 // 4 + (count >> 4) + (symbol_count > 3).
79 // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
80 // with bitwise or:
81 // (4 | (count >> 4)) + (symbol_count > 3).
82 // but using addition will allow the compiler to eliminate an operation when
83 // symbol_count is known and this function is inlined.
84 const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
85 // Hints for further optimizations:
86 //
87 // 1. clang can vectorize this for loop with width 4, even though the loop
88 // contains an if-else statement. Therefore, it may be advantageous to use
89 // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
90 // (a multiple of 4 that's not too small).
91 //
92 // 2. The for loop can be rewritten in the following form, which would enable
93 // clang to vectorize the loop with width 8:
94 //
95 // const int rounding = (1 << rate) - 1;
96 // for (int i = 0; i < symbol_count - 1; ++i) {
97 // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
98 // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
99 // }
100 //
101 // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
102 // integer arithmetic. The result of the unsigned subtraction is cast to a
103 // signed integer and right-shifted. This requires the right shift of a
104 // signed integer be an arithmetic shift, which is true for clang, gcc, and
105 // Visual C++.
106 assert(symbol_count - 1 > 0);
107 int i = 0;
108 do {
109 if (i < symbol) {
110 cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
111 } else {
112 cdf[i] -= cdf[i] >> rate;
113 }
114 } while (++i < symbol_count - 1);
115 cdf[symbol_count] += static_cast<uint16_t>(count < 32);
116 }
117
118 // Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
119 // of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
120 // SIMD instruction sets if available.
121
122 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
123
124 // The UpdateCdf() method contains the following for loop:
125 //
126 // for (int i = 0; i < symbol_count - 1; ++i) {
127 // if (i < symbol) {
128 // cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
129 // } else {
130 // cdf[i] -= cdf[i] >> rate;
131 // }
132 // }
133 //
134 // It can be rewritten in the following two forms, which are amenable to SIMD
135 // implementations:
136 //
137 // const int rounding = (1 << rate) - 1;
138 // for (int i = 0; i < symbol_count - 1; ++i) {
139 // const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
140 // cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
141 // }
142 //
143 // or:
144 //
145 // const int rounding = (1 << rate) - 1;
146 // for (int i = 0; i < symbol_count - 1; ++i) {
147 // const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
148 // cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
149 // }
150 //
151 // The following ARM NEON implementations use a modified version of the first
152 // form, using the comparison mask and unsigned rollover to avoid the need to
153 // calculate rounding.
154 //
155 // The cdf array has symbol_count + 1 elements. The first symbol_count elements
156 // are the CDF. The last element is a count that is initialized to 0 and may
157 // grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
158 // cdf[symbol_count - 1] is always 0, the for loop does not update
159 // cdf[symbol_count - 1]. However, it would be correct to have the for loop
160 // update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
161 // for loop would take the else branch when i is symbol_count - 1:
162 // cdf[i] -= cdf[i] >> rate;
163 // Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
164 // after the update. The ARM NEON implementations take advantage of this in the
165 // following two cases:
166 // 1. When symbol_count is 8 or 16, the vectorized code updates the first
167 // symbol_count elements in the array.
168 // 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
169 // the cdf array. Since an invalid CDF value is written into cdf[7], the
170 // count in cdf[7] needs to be fixed up after the vectorized code.
171
UpdateCdf5(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)172 void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
173 uint16x4_t cdf_vec = vld1_u16(cdf);
174 const uint16_t count = cdf[5];
175 const int rate = (count >> 4) + 5;
176 const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
177 const uint16x4_t index = vcreate_u16(0x0003000200010000);
178 const uint16x4_t symbol_vec = vdup_n_u16(symbol);
179 const uint16x4_t mask = vcge_u16(index, symbol_vec);
180 // i < symbol: 32768, i >= symbol: 65535.
181 const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
182 // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
183 const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
184 // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
185 const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
186 const int16x4_t negative_rate = vdup_n_s16(-rate);
187 // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
188 const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
189 // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
190 // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
191 cdf_vec = vadd_u16(cdf_offset, delta);
192 vst1_u16(cdf, cdf_vec);
193 cdf[5] = count + static_cast<uint16_t>(count < 32);
194 }
195
196 // This version works for |symbol_count| = 7, 8, or 9.
197 // See UpdateCdf5 for implementation details.
198 template <int symbol_count>
UpdateCdf7To9(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)199 void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
200 static_assert(symbol_count >= 7 && symbol_count <= 9, "");
201 uint16x8_t cdf_vec = vld1q_u16(cdf);
202 const uint16_t count = cdf[symbol_count];
203 const int rate = (count >> 4) + 5;
204 const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
205 const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
206 vcreate_u16(0x0007000600050004));
207 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
208 const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
209 const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
210 const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
211 const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
212 const int16x8_t negative_rate = vdupq_n_s16(-rate);
213 const uint16x8_t delta =
214 vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
215 cdf_vec = vaddq_u16(cdf_offset, delta);
216 vst1q_u16(cdf, cdf_vec);
217 cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
218 }
219
UpdateCdf7(uint16_t * const cdf,const int symbol)220 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
221 UpdateCdf7To9<7>(cdf, symbol);
222 }
223
UpdateCdf8(uint16_t * const cdf,const int symbol)224 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
225 UpdateCdf7To9<8>(cdf, symbol);
226 }
227
UpdateCdf9(uint16_t * const cdf,const int symbol)228 void UpdateCdf9(uint16_t* const cdf, const int symbol) {
229 UpdateCdf7To9<9>(cdf, symbol);
230 }
231
232 // See UpdateCdf5 for implementation details.
UpdateCdf11(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)233 void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
234 uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
235 const uint16_t count = cdf[11];
236 cdf[11] = count + static_cast<uint16_t>(count < 32);
237 const int rate = (count >> 4) + 5;
238 if (symbol > 1) {
239 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
240 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
241 const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
242 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
243 const int16x8_t negative_rate = vdupq_n_s16(-rate);
244 const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
245 vcreate_u16(0x0009000800070006));
246 const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
247 const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
248 const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
249 const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
250 const uint16x8_t delta =
251 vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
252 cdf_vec = vaddq_u16(cdf_offset, delta);
253 vst1q_u16(cdf + 2, cdf_vec);
254 } else {
255 if (symbol != 0) {
256 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
257 cdf[1] -= cdf[1] >> rate;
258 } else {
259 cdf[0] -= cdf[0] >> rate;
260 cdf[1] -= cdf[1] >> rate;
261 }
262 const int16x8_t negative_rate = vdupq_n_s16(-rate);
263 const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
264 cdf_vec = vsubq_u16(cdf_vec, delta);
265 vst1q_u16(cdf + 2, cdf_vec);
266 }
267 }
268
269 // See UpdateCdf5 for implementation details.
UpdateCdf13(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)270 void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
271 uint16x8_t cdf_vec0 = vld1q_u16(cdf);
272 uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
273 const uint16_t count = cdf[13];
274 const int rate = (count >> 4) + 5;
275 const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
276 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
277 const int16x8_t negative_rate = vdupq_n_s16(-rate);
278
279 uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
280 vcreate_u16(0x0007000600050004));
281 uint16x8_t mask = vcgeq_u16(index, symbol_vec);
282 uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
283 int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
284 uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
285 uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
286 cdf_vec0 = vaddq_u16(cdf_offset, delta);
287 vst1q_u16(cdf, cdf_vec0);
288
289 index = vcombine_u16(vcreate_u16(0x0007000600050004),
290 vcreate_u16(0x000b000a00090008));
291 mask = vcgeq_u16(index, symbol_vec);
292 a = vorrq_u16(mask, cdf_max_probability);
293 diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
294 cdf_offset = vsubq_u16(cdf_vec1, mask);
295 delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
296 cdf_vec1 = vaddq_u16(cdf_offset, delta);
297 vst1q_u16(cdf + 4, cdf_vec1);
298
299 cdf[13] = count + static_cast<uint16_t>(count < 32);
300 }
301
302 // See UpdateCdf5 for implementation details.
UpdateCdf16(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)303 void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
304 uint16x8_t cdf_vec = vld1q_u16(cdf);
305 const uint16_t count = cdf[16];
306 const int rate = (count >> 4) + 5;
307 const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
308 const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
309 const int16x8_t negative_rate = vdupq_n_s16(-rate);
310
311 uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
312 vcreate_u16(0x0007000600050004));
313 uint16x8_t mask = vcgeq_u16(index, symbol_vec);
314 uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
315 int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
316 uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
317 uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
318 cdf_vec = vaddq_u16(cdf_offset, delta);
319 vst1q_u16(cdf, cdf_vec);
320
321 cdf_vec = vld1q_u16(cdf + 8);
322 index = vcombine_u16(vcreate_u16(0x000b000a00090008),
323 vcreate_u16(0x000f000e000d000c));
324 mask = vcgeq_u16(index, symbol_vec);
325 a = vorrq_u16(mask, cdf_max_probability);
326 diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
327 cdf_offset = vsubq_u16(cdf_vec, mask);
328 delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
329 cdf_vec = vaddq_u16(cdf_offset, delta);
330 vst1q_u16(cdf + 8, cdf_vec);
331
332 cdf[16] = count + static_cast<uint16_t>(count < 32);
333 }
334
335 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
336
337 #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
338
LoadLo8(const void * a)339 inline __m128i LoadLo8(const void* a) {
340 return _mm_loadl_epi64(static_cast<const __m128i*>(a));
341 }
342
LoadUnaligned16(const void * a)343 inline __m128i LoadUnaligned16(const void* a) {
344 return _mm_loadu_si128(static_cast<const __m128i*>(a));
345 }
346
StoreLo8(void * a,const __m128i v)347 inline void StoreLo8(void* a, const __m128i v) {
348 _mm_storel_epi64(static_cast<__m128i*>(a), v);
349 }
350
StoreUnaligned16(void * a,const __m128i v)351 inline void StoreUnaligned16(void* a, const __m128i v) {
352 _mm_storeu_si128(static_cast<__m128i*>(a), v);
353 }
354
UpdateCdf5(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)355 void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
356 __m128i cdf_vec = LoadLo8(cdf);
357 const uint16_t count = cdf[5];
358 const int rate = (count >> 4) + 5;
359 const __m128i cdf_max_probability =
360 _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
361 const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
362 const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
363 // i >= symbol.
364 const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
365 // i < symbol: 32768, i >= symbol: 65535.
366 const __m128i a = _mm_or_si128(mask, cdf_max_probability);
367 // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
368 const __m128i diff = _mm_sub_epi16(a, cdf_vec);
369 // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
370 const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
371 // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
372 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
373 // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
374 // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
375 cdf_vec = _mm_add_epi16(cdf_offset, delta);
376 StoreLo8(cdf, cdf_vec);
377 cdf[5] = count + static_cast<uint16_t>(count < 32);
378 }
379
380 // This version works for |symbol_count| = 7, 8, or 9.
381 // See UpdateCdf5 for implementation details.
382 template <int symbol_count>
UpdateCdf7To9(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)383 void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
384 static_assert(symbol_count >= 7 && symbol_count <= 9, "");
385 __m128i cdf_vec = LoadUnaligned16(cdf);
386 const uint16_t count = cdf[symbol_count];
387 const int rate = (count >> 4) + 5;
388 const __m128i cdf_max_probability =
389 _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
390 const __m128i index =
391 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
392 const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
393 const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
394 const __m128i a = _mm_or_si128(mask, cdf_max_probability);
395 const __m128i diff = _mm_sub_epi16(a, cdf_vec);
396 const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
397 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
398 cdf_vec = _mm_add_epi16(cdf_offset, delta);
399 StoreUnaligned16(cdf, cdf_vec);
400 cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
401 }
402
UpdateCdf7(uint16_t * const cdf,const int symbol)403 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
404 UpdateCdf7To9<7>(cdf, symbol);
405 }
406
UpdateCdf8(uint16_t * const cdf,const int symbol)407 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
408 UpdateCdf7To9<8>(cdf, symbol);
409 }
410
UpdateCdf9(uint16_t * const cdf,const int symbol)411 void UpdateCdf9(uint16_t* const cdf, const int symbol) {
412 UpdateCdf7To9<9>(cdf, symbol);
413 }
414
415 // See UpdateCdf5 for implementation details.
UpdateCdf11(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)416 void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
417 __m128i cdf_vec = LoadUnaligned16(cdf + 2);
418 const uint16_t count = cdf[11];
419 cdf[11] = count + static_cast<uint16_t>(count < 32);
420 const int rate = (count >> 4) + 5;
421 if (symbol > 1) {
422 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
423 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
424 const __m128i cdf_max_probability =
425 _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
426 const __m128i index =
427 _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
428 const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
429 const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
430 const __m128i a = _mm_or_si128(mask, cdf_max_probability);
431 const __m128i diff = _mm_sub_epi16(a, cdf_vec);
432 const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
433 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
434 cdf_vec = _mm_add_epi16(cdf_offset, delta);
435 StoreUnaligned16(cdf + 2, cdf_vec);
436 } else {
437 if (symbol != 0) {
438 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
439 cdf[1] -= cdf[1] >> rate;
440 } else {
441 cdf[0] -= cdf[0] >> rate;
442 cdf[1] -= cdf[1] >> rate;
443 }
444 const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
445 cdf_vec = _mm_sub_epi16(cdf_vec, delta);
446 StoreUnaligned16(cdf + 2, cdf_vec);
447 }
448 }
449
450 // See UpdateCdf5 for implementation details.
UpdateCdf13(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)451 void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
452 __m128i cdf_vec0 = LoadLo8(cdf);
453 __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
454 const uint16_t count = cdf[13];
455 const int rate = (count >> 4) + 5;
456 const __m128i cdf_max_probability =
457 _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
458 const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
459
460 const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
461 const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
462 const __m128i a = _mm_or_si128(mask, cdf_max_probability);
463 const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
464 const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
465 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
466 cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
467 StoreLo8(cdf, cdf_vec0);
468
469 const __m128i index1 =
470 _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
471 const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
472 const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
473 const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
474 const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
475 const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
476 cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
477 StoreUnaligned16(cdf + 4, cdf_vec1);
478
479 cdf[13] = count + static_cast<uint16_t>(count < 32);
480 }
481
UpdateCdf16(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)482 void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
483 __m128i cdf_vec0 = LoadUnaligned16(cdf);
484 const uint16_t count = cdf[16];
485 const int rate = (count >> 4) + 5;
486 const __m128i cdf_max_probability =
487 _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
488 const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
489
490 const __m128i index =
491 _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
492 const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
493 const __m128i a = _mm_or_si128(mask, cdf_max_probability);
494 const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
495 const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
496 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
497 cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
498 StoreUnaligned16(cdf, cdf_vec0);
499
500 __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
501 const __m128i index1 =
502 _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
503 const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
504 const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
505 const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
506 const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
507 const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
508 cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
509 StoreUnaligned16(cdf + 8, cdf_vec1);
510
511 cdf[16] = count + static_cast<uint16_t>(count < 32);
512 }
513
514 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
515
UpdateCdf5(uint16_t * const cdf,const int symbol)516 void UpdateCdf5(uint16_t* const cdf, const int symbol) {
517 UpdateCdf(cdf, 5, symbol);
518 }
519
UpdateCdf7(uint16_t * const cdf,const int symbol)520 void UpdateCdf7(uint16_t* const cdf, const int symbol) {
521 UpdateCdf(cdf, 7, symbol);
522 }
523
UpdateCdf8(uint16_t * const cdf,const int symbol)524 void UpdateCdf8(uint16_t* const cdf, const int symbol) {
525 UpdateCdf(cdf, 8, symbol);
526 }
527
UpdateCdf9(uint16_t * const cdf,const int symbol)528 void UpdateCdf9(uint16_t* const cdf, const int symbol) {
529 UpdateCdf(cdf, 9, symbol);
530 }
531
UpdateCdf11(uint16_t * const cdf,const int symbol)532 void UpdateCdf11(uint16_t* const cdf, const int symbol) {
533 UpdateCdf(cdf, 11, symbol);
534 }
535
UpdateCdf13(uint16_t * const cdf,const int symbol)536 void UpdateCdf13(uint16_t* const cdf, const int symbol) {
537 UpdateCdf(cdf, 13, symbol);
538 }
539
UpdateCdf16(uint16_t * const cdf,const int symbol)540 void UpdateCdf16(uint16_t* const cdf, const int symbol) {
541 UpdateCdf(cdf, 16, symbol);
542 }
543
544 #endif // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
545 #endif // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
546
HostToBigEndian(const EntropyDecoder::WindowSize x)547 inline EntropyDecoder::WindowSize HostToBigEndian(
548 const EntropyDecoder::WindowSize x) {
549 static_assert(sizeof(x) == 4 || sizeof(x) == 8, "");
550 #if defined(__GNUC__)
551 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
552 return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
553 #else
554 return x;
555 #endif
556 #elif defined(_WIN32)
557 // Note Windows targets are assumed to be little endian.
558 return static_cast<EntropyDecoder::WindowSize>(
559 (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
560 : _byteswap_ulong(static_cast<unsigned long>(x)));
561 #else
562 #error Unknown compiler!
563 #endif // defined(__GNUC__)
564 }
565
566 } // namespace
567
568 #if !LIBGAV1_CXX17
569 constexpr int EntropyDecoder::kWindowSize; // static.
570 #endif
571
EntropyDecoder(const uint8_t * data,size_t size,bool allow_update_cdf)572 EntropyDecoder::EntropyDecoder(const uint8_t* data, size_t size,
573 bool allow_update_cdf)
574 : data_(data),
575 data_end_(data + size),
576 data_memcpy_end_((size >= sizeof(WindowSize))
577 ? data + size - sizeof(WindowSize) + 1
578 : data),
579 allow_update_cdf_(allow_update_cdf),
580 values_in_range_(kCdfMaxProbability) {
581 if (data_ < data_memcpy_end_) {
582 // This is a simplified version of PopulateBits() which loads 8 extra bits
583 // and skips the unnecessary shifts of value and window_diff_.
584 WindowSize value;
585 memcpy(&value, data_, sizeof(value));
586 data_ += sizeof(value);
587 window_diff_ = HostToBigEndian(value) ^ -1;
588 // Note the initial value of bits_ is larger than kMaxCachedBits as it's
589 // used to restore the most significant 0 bit that would be present after
590 // PopulateBits() when we extract the first symbol value.
591 // As shown in Section 8.2.2 Initialization process for symbol decoder,
592 // which uses a fixed offset to read the symbol values, the most
593 // significant bit is always 0:
594 // The variable numBits is set equal to Min( sz * 8, 15).
595 // The variable buf is read using the f(numBits) parsing process.
596 // The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
597 // The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
598 bits_ = kWindowSize - 15;
599 return;
600 }
601 window_diff_ = 0;
602 bits_ = -15;
603 PopulateBits();
604 }
605
606 // This is similar to the ReadSymbol() implementation but it is optimized based
607 // on the following facts:
608 // * The probability is fixed at half. So some multiplications can be replaced
609 // with bit operations.
610 // * Symbol count is fixed at 2.
ReadBit()611 int EntropyDecoder::ReadBit() {
612 const uint32_t curr =
613 ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
614 const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
615 int bit = 1;
616 if (symbol_value >= curr) {
617 values_in_range_ -= curr;
618 window_diff_ -= static_cast<WindowSize>(curr) << bits_;
619 bit = 0;
620 } else {
621 values_in_range_ = curr;
622 }
623 NormalizeRange();
624 return bit;
625 }
626
ReadLiteral(int num_bits)627 int64_t EntropyDecoder::ReadLiteral(int num_bits) {
628 assert(num_bits <= 32);
629 assert(num_bits > 0);
630 uint32_t literal = 0;
631 int bit = num_bits - 1;
632 do {
633 // ARM can combine a shift operation with a constant number of bits with
634 // some other operations, such as the OR operation.
635 // Here is an ARM disassembly example:
636 // orr w1, w0, w1, lsl #1
637 // which left shifts register w1 by 1 bit and OR the shift result with
638 // register w0.
639 // The next 2 lines are equivalent to:
640 // literal |= static_cast<uint32_t>(ReadBit()) << bit;
641 literal <<= 1;
642 literal |= static_cast<uint32_t>(ReadBit());
643 } while (--bit >= 0);
644 return literal;
645 }
646
ReadSymbol(uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)647 int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf,
648 int symbol_count) {
649 const int symbol = ReadSymbolImpl(cdf, symbol_count);
650 if (allow_update_cdf_) {
651 UpdateCdf(cdf, symbol_count, symbol);
652 }
653 return symbol;
654 }
655
ReadSymbol(uint16_t * LIBGAV1_RESTRICT cdf)656 bool EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT cdf) {
657 assert(cdf[1] == 0);
658 const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
659 if (allow_update_cdf_) {
660 const uint16_t count = cdf[2];
661 // rate is computed in the spec as:
662 // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
663 // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
664 // 4 + (count > 15) + (count > 31)
665 // Note that the largest value for count is 32 (it is not incremented beyond
666 // 32). So using that information:
667 // count >> 4 is 0 for count from 0 to 15.
668 // count >> 4 is 1 for count from 16 to 31.
669 // count >> 4 is 2 for count == 32.
670 // Now, the equation becomes:
671 // 4 + (count >> 4).
672 // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
673 // with bitwise or. So the final equation is:
674 // 4 | (count >> 4).
675 const int rate = 4 | (count >> 4);
676 if (symbol) {
677 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
678 } else {
679 cdf[0] -= cdf[0] >> rate;
680 }
681 cdf[2] += static_cast<uint16_t>(count < 32);
682 }
683 return symbol;
684 }
685
ReadSymbolWithoutCdfUpdate(uint16_t cdf)686 bool EntropyDecoder::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
687 return ReadSymbolImpl(cdf) != 0;
688 }
689
690 template <int symbol_count>
ReadSymbol(uint16_t * LIBGAV1_RESTRICT const cdf)691 int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf) {
692 static_assert(symbol_count >= 3 && symbol_count <= 16, "");
693 if (symbol_count == 3 || symbol_count == 4) {
694 return ReadSymbol3Or4(cdf, symbol_count);
695 }
696 int symbol;
697 if (symbol_count == 8) {
698 symbol = ReadSymbolImpl8(cdf);
699 } else if (symbol_count <= 13) {
700 symbol = ReadSymbolImpl(cdf, symbol_count);
701 } else {
702 symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
703 }
704 if (allow_update_cdf_) {
705 if (symbol_count == 5) {
706 UpdateCdf5(cdf, symbol);
707 } else if (symbol_count == 7) {
708 UpdateCdf7(cdf, symbol);
709 } else if (symbol_count == 8) {
710 UpdateCdf8(cdf, symbol);
711 } else if (symbol_count == 9) {
712 UpdateCdf9(cdf, symbol);
713 } else if (symbol_count == 11) {
714 UpdateCdf11(cdf, symbol);
715 } else if (symbol_count == 13) {
716 UpdateCdf13(cdf, symbol);
717 } else if (symbol_count == 16) {
718 UpdateCdf16(cdf, symbol);
719 } else {
720 UpdateCdf(cdf, symbol_count, symbol);
721 }
722 }
723 return symbol;
724 }
725
ReadSymbolImpl(const uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)726 int EntropyDecoder::ReadSymbolImpl(const uint16_t* LIBGAV1_RESTRICT const cdf,
727 int symbol_count) {
728 assert(cdf[symbol_count - 1] == 0);
729 --symbol_count;
730 uint32_t curr = values_in_range_;
731 int symbol = -1;
732 uint32_t prev;
733 const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
734 uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
735 // Search through the |cdf| array to determine where the scaled cdf value and
736 // |symbol_value| cross over.
737 do {
738 prev = curr;
739 curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
740 delta;
741 delta -= kMinimumProbabilityPerSymbol;
742 } while (symbol_value < curr);
743 values_in_range_ = prev - curr;
744 window_diff_ -= static_cast<WindowSize>(curr) << bits_;
745 NormalizeRange();
746 return symbol;
747 }
748
ReadSymbolImplBinarySearch(const uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)749 int EntropyDecoder::ReadSymbolImplBinarySearch(
750 const uint16_t* LIBGAV1_RESTRICT const cdf, int symbol_count) {
751 assert(cdf[symbol_count - 1] == 0);
752 assert(symbol_count > 1 && symbol_count <= 16);
753 --symbol_count;
754 const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
755 // Search through the |cdf| array to determine where the scaled cdf value and
756 // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
757 // search to do this. Let |symbol| be the index of the first |cdf| array
758 // entry whose scaled cdf value is less than or equal to |symbol_value|. The
759 // binary search maintains the invariant:
760 // low <= symbol <= high + 1
761 // and terminates when low == high + 1.
762 int low = 0;
763 int high = symbol_count - 1;
764 // The binary search maintains the invariants that |prev| is the scaled cdf
765 // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
766 // convention, the scaled cdf value for -1 is values_in_range_.) When the
767 // binary search terminates, |prev| is the scaled cdf value for symbol - 1
768 // and |curr| is the scaled cdf value for |symbol|.
769 uint32_t prev = values_in_range_;
770 uint32_t curr = 0;
771 const uint32_t values_in_range_shifted = values_in_range_ >> 8;
772 do {
773 const int mid = DivideBy2(low + high);
774 const uint32_t scaled_cdf =
775 ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
776 if (symbol_value < scaled_cdf) {
777 low = mid + 1;
778 prev = scaled_cdf;
779 } else {
780 high = mid - 1;
781 curr = scaled_cdf;
782 }
783 } while (low <= high);
784 assert(low == high + 1);
785 // At this point, |low| is the symbol that has been decoded.
786 values_in_range_ = prev - curr;
787 window_diff_ -= static_cast<WindowSize>(curr) << bits_;
788 NormalizeRange();
789 return low;
790 }
791
ReadSymbolImpl(uint16_t cdf)792 int EntropyDecoder::ReadSymbolImpl(uint16_t cdf) {
793 const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
794 const uint32_t curr =
795 (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
796 kMinimumProbabilityPerSymbol;
797 const int symbol = static_cast<int>(symbol_value < curr);
798 if (symbol == 1) {
799 values_in_range_ = curr;
800 } else {
801 values_in_range_ -= curr;
802 window_diff_ -= static_cast<WindowSize>(curr) << bits_;
803 }
804 NormalizeRange();
805 return symbol;
806 }
807
808 // Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
809 // calls inlined.
ReadSymbol3Or4(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol_count)810 int EntropyDecoder::ReadSymbol3Or4(uint16_t* LIBGAV1_RESTRICT const cdf,
811 const int symbol_count) {
812 assert(cdf[symbol_count - 1] == 0);
813 uint32_t curr = values_in_range_;
814 uint32_t prev;
815 const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
816 uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
817 const uint32_t values_in_range_shifted = values_in_range_ >> 8;
818
819 // Search through the |cdf| array to determine where the scaled cdf value and
820 // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
821 // array.
822 //
823 // The original code is:
824 //
825 // int symbol = -1;
826 // do {
827 // prev = curr;
828 // curr =
829 // ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
830 // + delta;
831 // delta -= kMinimumProbabilityPerSymbol;
832 // } while (symbol_value < curr);
833 // if (allow_update_cdf_) {
834 // UpdateCdf(cdf, [3,4], symbol);
835 // }
836 //
837 // The do-while loop is unrolled with three or four iterations, and the
838 // UpdateCdf call is inlined and merged into the iterations.
839 int symbol = 0;
840 // Iteration 0.
841 prev = curr;
842 curr =
843 ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
844 if (symbol_value >= curr) {
845 // symbol == 0.
846 if (allow_update_cdf_) {
847 // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
848 const uint16_t count = cdf[symbol_count];
849 cdf[symbol_count] += static_cast<uint16_t>(count < 32);
850 const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
851 if (symbol_count == 4) {
852 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
853 // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
854 // NEON code is slower. Consider using the C version if __arm__ is
855 // defined.
856 // 2. The ARM NEON code (compiled for arm64) is slightly slower on
857 // Samsung Galaxy S8+ (SM-G955FD).
858 uint16x4_t cdf_vec = vld1_u16(cdf);
859 const int16x4_t negative_rate = vdup_n_s16(-rate);
860 const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
861 cdf_vec = vsub_u16(cdf_vec, delta);
862 vst1_u16(cdf, cdf_vec);
863 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
864 __m128i cdf_vec = LoadLo8(cdf);
865 const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
866 cdf_vec = _mm_sub_epi16(cdf_vec, delta);
867 StoreLo8(cdf, cdf_vec);
868 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
869 cdf[0] -= cdf[0] >> rate;
870 cdf[1] -= cdf[1] >> rate;
871 cdf[2] -= cdf[2] >> rate;
872 #endif
873 } else { // symbol_count == 3.
874 cdf[0] -= cdf[0] >> rate;
875 cdf[1] -= cdf[1] >> rate;
876 }
877 }
878 goto found;
879 }
880 ++symbol;
881 delta -= kMinimumProbabilityPerSymbol;
882 // Iteration 1.
883 prev = curr;
884 curr =
885 ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
886 if (symbol_value >= curr) {
887 // symbol == 1.
888 if (allow_update_cdf_) {
889 // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
890 const uint16_t count = cdf[symbol_count];
891 cdf[symbol_count] += static_cast<uint16_t>(count < 32);
892 const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
893 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
894 cdf[1] -= cdf[1] >> rate;
895 if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
896 }
897 goto found;
898 }
899 ++symbol;
900 if (symbol_count == 4) {
901 delta -= kMinimumProbabilityPerSymbol;
902 // Iteration 2.
903 prev = curr;
904 curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
905 delta;
906 if (symbol_value >= curr) {
907 // symbol == 2.
908 if (allow_update_cdf_) {
909 // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
910 const uint16_t count = cdf[4];
911 cdf[4] += static_cast<uint16_t>(count < 32);
912 const int rate = (count >> 4) + 5;
913 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
914 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
915 cdf[2] -= cdf[2] >> rate;
916 }
917 goto found;
918 }
919 ++symbol;
920 }
921 // |delta| is 0 for the last iteration.
922 // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
923 prev = curr;
924 // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
925 curr = 0;
926 // symbol == [2,3].
927 if (allow_update_cdf_) {
928 // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
929 const uint16_t count = cdf[symbol_count];
930 cdf[symbol_count] += static_cast<uint16_t>(count < 32);
931 const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
932 if (symbol_count == 4) {
933 #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
934 // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
935 // code is a tiny bit slower. Consider using the C version if __arm__ is
936 // defined.
937 uint16x4_t cdf_vec = vld1_u16(cdf);
938 const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
939 const int16x4_t diff =
940 vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
941 const int16x4_t negative_rate = vdup_n_s16(-rate);
942 const uint16x4_t delta =
943 vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
944 cdf_vec = vadd_u16(cdf_vec, delta);
945 vst1_u16(cdf, cdf_vec);
946 cdf[3] = 0;
947 #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
948 __m128i cdf_vec = LoadLo8(cdf);
949 const __m128i cdf_max_probability =
950 _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
951 const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
952 const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
953 cdf_vec = _mm_add_epi16(cdf_vec, delta);
954 StoreLo8(cdf, cdf_vec);
955 cdf[3] = 0;
956 #else // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
957 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
958 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
959 cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
960 #endif
961 } else { // symbol_count == 3.
962 cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
963 cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
964 }
965 }
966 found:
967 // End of unrolled do-while loop.
968
969 values_in_range_ = prev - curr;
970 window_diff_ -= static_cast<WindowSize>(curr) << bits_;
971 NormalizeRange();
972 return symbol;
973 }
974
ReadSymbolImpl8(const uint16_t * LIBGAV1_RESTRICT const cdf)975 int EntropyDecoder::ReadSymbolImpl8(
976 const uint16_t* LIBGAV1_RESTRICT const cdf) {
977 assert(cdf[7] == 0);
978 uint32_t curr = values_in_range_;
979 uint32_t prev;
980 const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
981 uint32_t delta = kMinimumProbabilityPerSymbol * 7;
982 // Search through the |cdf| array to determine where the scaled cdf value and
983 // |symbol_value| cross over.
984 //
985 // The original code is:
986 //
987 // int symbol = -1;
988 // do {
989 // prev = curr;
990 // curr =
991 // (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
992 // + delta;
993 // delta -= kMinimumProbabilityPerSymbol;
994 // } while (symbol_value < curr);
995 //
996 // The do-while loop is unrolled with eight iterations.
997 int symbol = 0;
998
999 #define READ_SYMBOL_ITERATION \
1000 prev = curr; \
1001 curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
1002 delta; \
1003 if (symbol_value >= curr) goto found; \
1004 ++symbol; \
1005 delta -= kMinimumProbabilityPerSymbol
1006
1007 READ_SYMBOL_ITERATION; // Iteration 0.
1008 READ_SYMBOL_ITERATION; // Iteration 1.
1009 READ_SYMBOL_ITERATION; // Iteration 2.
1010 READ_SYMBOL_ITERATION; // Iteration 3.
1011 READ_SYMBOL_ITERATION; // Iteration 4.
1012 READ_SYMBOL_ITERATION; // Iteration 5.
1013
1014 // The last two iterations can be simplified, so they don't use the
1015 // READ_SYMBOL_ITERATION macro.
1016 #undef READ_SYMBOL_ITERATION
1017
1018 // Iteration 6.
1019 prev = curr;
1020 curr =
1021 (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
1022 if (symbol_value >= curr) goto found; // symbol == 6.
1023 ++symbol;
1024 // |delta| is 0 for the last iteration.
1025 // Iteration 7.
1026 prev = curr;
1027 // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
1028 curr = 0;
1029 // symbol == 7.
1030 found:
1031 // End of unrolled do-while loop.
1032
1033 values_in_range_ = prev - curr;
1034 window_diff_ -= static_cast<WindowSize>(curr) << bits_;
1035 NormalizeRange();
1036 return symbol;
1037 }
1038
PopulateBits()1039 void EntropyDecoder::PopulateBits() {
1040 constexpr int kMaxCachedBits = kWindowSize - 16;
1041 #if defined(__aarch64__)
1042 // Fast path: read eight bytes and add the first six bytes to window_diff_.
1043 // This fast path makes the following assumptions.
1044 // 1. We assume that unaligned load of uint64_t is fast.
1045 // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
1046 // bytes depending on the value of bits_. This fast path always reads 6
1047 // bytes, which results in more calls to PopulateBits(). We assume that
1048 // making more calls to a faster PopulateBits() is overall a win.
1049 // NOTE: Although this fast path could also be used on x86_64, it hurts
1050 // performance (measured on Lenovo ThinkStation P920 running Linux). (The
1051 // reason is still unknown.) Therefore this fast path is only used on arm64.
1052 static_assert(kWindowSize == 64, "");
1053 if (data_ < data_memcpy_end_) {
1054 uint64_t value;
1055 // arm64 supports unaligned loads, so this memcpy call is compiled to a
1056 // single ldr instruction.
1057 memcpy(&value, data_, sizeof(value));
1058 data_ += kMaxCachedBits >> 3;
1059 value = HostToBigEndian(value) ^ -1;
1060 value >>= kWindowSize - kMaxCachedBits;
1061 window_diff_ = value | (window_diff_ << kMaxCachedBits);
1062 bits_ += kMaxCachedBits;
1063 return;
1064 }
1065 #endif
1066
1067 const uint8_t* data = data_;
1068 int bits = bits_;
1069 WindowSize window_diff = window_diff_;
1070
1071 int count = kWindowSize - 9 - (bits + 15);
1072 // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
1073 // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
1074 // iterations when WindowSize is 64 bits. So it is not profitable to
1075 // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
1076 // the fast path above is not compiled.
1077
1078 #ifdef __clang__
1079 #pragma clang loop vectorize(disable) interleave(disable)
1080 #endif
1081 for (; count >= 0 && data < data_end_; count -= 8) {
1082 const uint8_t value = *data++ ^ -1;
1083 window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
1084 bits += 8;
1085 }
1086 assert(bits <= kMaxCachedBits);
1087 if (data == data_end_) {
1088 // Shift in some 1s. This is equivalent to providing fake 0 data bits.
1089 window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
1090 bits = kMaxCachedBits;
1091 }
1092
1093 data_ = data;
1094 bits_ = bits;
1095 window_diff_ = window_diff;
1096 }
1097
NormalizeRange()1098 void EntropyDecoder::NormalizeRange() {
1099 const int bits_used = 15 ^ FloorLog2(values_in_range_);
1100 bits_ -= bits_used;
1101 values_in_range_ <<= bits_used;
1102 if (bits_ < 0) PopulateBits();
1103 }
1104
1105 // Explicit instantiations.
1106 template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
1107 template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
1108 template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
1109 template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
1110 template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
1111 template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
1112 template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
1113 template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
1114 template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
1115 template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
1116 template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
1117 template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
1118 template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
1119
1120 } // namespace libgav1
1121